<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>jaam_mini.log</title>
        <link>https://velog.io/</link>
        <description>비전공자의 데이터 공부법</description>
        <lastBuildDate>Thu, 01 Feb 2024 03:33:55 GMT</lastBuildDate>
        <docs>https://validator.w3.org/feed/docs/rss2.html</docs>
        <generator>https://github.com/jpmonette/feed</generator>
        <image>
            <title>jaam_mini.log</title>
            <url>https://velog.velcdn.com/images/jaam_mini/profile/40e515b7-7b2a-4581-a31f-29004e27e66d/image.png</url>
            <link>https://velog.io/</link>
        </image>
        <copyright>Copyright (C) 2019. jaam_mini.log. All rights reserved.</copyright>
        <atom:link href="https://v2.velog.io/rss/jaam_mini" rel="self" type="application/rss+xml"/>
        <item>
            <title><![CDATA[ML - 15. 자연어 처리 NLP (Natural Language Processing)]]></title>
            <link>https://velog.io/@jaam_mini/ML-15.-%EC%9E%90%EC%97%B0%EC%96%B4-%EC%B2%98%EB%A6%AC-NLP-Natural-Language-Processing</link>
            <guid>https://velog.io/@jaam_mini/ML-15.-%EC%9E%90%EC%97%B0%EC%96%B4-%EC%B2%98%EB%A6%AC-NLP-Natural-Language-Processing</guid>
            <pubDate>Thu, 01 Feb 2024 03:33:55 GMT</pubDate>
            <description><![CDATA[<h3 id="install">install</h3>
<pre><code># 가장 최신 버전으로 유지
!conda update conda
!pip install --upgrade pip
# 한글 자연어 처리 패키지
!pip install konlpy
!pip install tweepy==3.10.0
!conda install -y -c conda-forge jpype1==1.0.2
!conda install -y -c conda-forge wordcloud
!conda install -y nltk
!conda install -y scikit-learn</code></pre><pre><code>import nltk
nltk.download()</code></pre><pre><code>from konlpy.tag import Okt
t = Okt()</code></pre><hr>
<h3 id="kkma">Kkma</h3>
<pre><code>from konlpy.tag import Kkma
kkma = Kkma()</code></pre><pre><code># 문장(sentences)
kkma.sentences(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[&#39;한국어 분석을 시작합니다&#39;, &#39;재미있어요 ~&#39;]</p>
<pre><code># 명사(nouns)
kkma.nouns(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[&#39;한국어&#39;, &#39;분석&#39;]</p>
<pre><code># 형태소분석(pos)
kkma.pos(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[(&#39;한국어&#39;, &#39;NNG&#39;),
 (&#39;분석&#39;, &#39;NNG&#39;),
 (&#39;을&#39;, &#39;JKO&#39;),
 (&#39;시작하&#39;, &#39;VV&#39;),
 (&#39;ㅂ니다&#39;, &#39;EFN&#39;),
 (&#39;재미있&#39;, &#39;VA&#39;),
 (&#39;어요&#39;, &#39;EFN&#39;),
 (&#39;~&#39;, &#39;SO&#39;)]</p>
<hr>
<h3 id="hannanum">Hannanum</h3>
<pre><code>from konlpy.tag import Hannanum
hannanum = Hannanum()</code></pre><pre><code>hannanum.nouns(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[&#39;한국어&#39;, &#39;분석&#39;, &#39;시작&#39;]</p>
<pre><code>hannanum.morphs(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[&#39;한국어&#39;, &#39;분석&#39;, &#39;을&#39;, &#39;시작&#39;, &#39;하&#39;, &#39;ㅂ니다&#39;, &#39;재미있&#39;, &#39;어&#39;, &#39;요&#39;, &#39;~&#39;]</p>
<pre><code>hannanum.pos(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[(&#39;한국어&#39;, &#39;N&#39;),
 (&#39;분석&#39;, &#39;N&#39;),
 (&#39;을&#39;, &#39;J&#39;),
 (&#39;시작&#39;, &#39;N&#39;),
 (&#39;하&#39;, &#39;X&#39;),
 (&#39;ㅂ니다&#39;, &#39;E&#39;),
 (&#39;재미있&#39;, &#39;P&#39;),
 (&#39;어&#39;, &#39;E&#39;),
 (&#39;요&#39;, &#39;J&#39;),
 (&#39;~&#39;, &#39;S&#39;)]</p>
<hr>
<h3 id="okt">Okt</h3>
<pre><code># UserWarning: &quot;Twitter&quot; has changed to &quot;Okt&quot; since KoNLPy v0.4.5. warn(&#39;&quot;Twitter&quot; has changed to &quot;Okt&quot; since KoNLPy v0.4.5.&#39;)
from konlpy.tag import Okt
t = Okt()</code></pre><pre><code>t.nouns(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[&#39;한국어&#39;, &#39;분석&#39;, &#39;시작&#39;]</p>
<pre><code>t.morphs(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[&#39;한국어&#39;, &#39;분석&#39;, &#39;을&#39;, &#39;시작&#39;, &#39;합니다&#39;, &#39;재미있어요&#39;, &#39;~&#39;]</p>
<pre><code>t.pos(&#39;한국어 분석을 시작합니다 재미있어요 ~&#39;)</code></pre><p>[(&#39;한국어&#39;, &#39;Noun&#39;),
 (&#39;분석&#39;, &#39;Noun&#39;),
 (&#39;을&#39;, &#39;Josa&#39;),
 (&#39;시작&#39;, &#39;Noun&#39;),
 (&#39;합니다&#39;, &#39;Verb&#39;),
 (&#39;재미있어요&#39;, &#39;Adjective&#39;),
 (&#39;~&#39;, &#39;Punctuation&#39;)]</p>
<hr>
<h2 id="1워드클라우드-wordcloud">1.워드클라우드 wordcloud</h2>
<ul>
<li>중요하지 않은 영어 단어들을 제거하는 역할</li>
</ul>
<pre><code>from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image</code></pre><pre><code># 소설 읽어오기
text = open(&quot;./15. alice.txt&quot;).read()
# 이미지 읽어오기
alice_mask = np.array(Image.open(&quot;./15. alice_mask.png&quot;))
# said 단어 제거
stopwords = set(STOPWORDS)
stopwords.add(&#39;said&#39;)</code></pre><pre><code>import platform
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

path = &quot;c:/Windows/Fonts/malgun.ttf&quot;

if platform.system() == &quot;Darwin&quot;:
  print(&quot;Hangle OK in your MAC!!!&quot;)
  rc(&quot;font&quot;, family=&quot;AppleGothic&quot;)
elif platform.system() == &quot;Windows&quot;:
  font_name = font_manager.FontProperties(fname=path).get_name()
  print(&quot;Hangle OK in your Windows!!!&quot;)
  rc(&quot;font&quot;, family=font_name)
else:
  print(&quot;Sorry, Unkwnown System&quot;)

plt.rcParams[&quot;axes.unicode_minus&quot;] = False</code></pre><p>Hangle OK in your Windows!!!</p>
<pre><code>plt.figure(figsize=(8,8))
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation=&#39;bilinear&#39;)
# plt.axis(&#39;off&#39;)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/bea02aa1-36bf-4db6-9780-e77210e794fc/image.png" alt=""></li>
</ul>
<pre><code># WordCloud 모듈은 자체적으로 단어를 추출해서 빈도수를 조사하고 정규화하는 기능을 가지고 있다
wc = WordCloud(
    background_color=&#39;white&#39;, max_words=2000, mask=alice_mask, stopwords=stopwords
)
wc = wc.generate(text)
wc.words_</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a70064ba-52eb-4feb-843e-fa57248d5553/image.png" alt=""></li>
</ul>
<pre><code>plt.figure(figsize=(8,8))
plt.imshow(wc,interpolation=&#39;bilinear&#39;)
# plt.axis(&#39;off&#39;)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8c40bbb8-0c73-40c5-bcbf-1bc649f21fdf/image.png" alt=""></li>
</ul>
<hr>
<h3 id="twitter">Twitter</h3>
<pre><code>import nltk
from konlpy.corpus import kobill

files_ko = kobill.fileids()
doc_ko = kobill.open(&#39;1809890.txt&#39;).read()
doc_ko</code></pre><p>명사분석</p>
<pre><code>from konlpy.tag import Okt

t = Okt()
token_ko = t.nouns(doc_ko)
token_ko # 명사 단어들의 집합</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/7b7cbb32-3562-48f8-84be-372bbb758097/image.png" alt=""></li>
</ul>
<p>빈도수분석</p>
<pre><code>ko = nltk.Text(token_ko, name=&#39;육아휴직법&#39;)

#token_ko : 명사 단어들의 집합</code></pre><pre><code># 이 문자열 단위 : token
print(len(ko.tokens)) # 사용된 단어들
print(len(set(ko.tokens))) # 중복 제외 단어들
ko.vocab() # 어떤 단어들이 있나요? : vocab(단어의 집합)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/50902ad5-5b1e-4343-92ee-5d5e451341f4/image.png" alt=""></li>
</ul>
<pre><code>plt.figure(figsize=(12,6))
ko.plot(50)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/2b007c63-2730-411e-bf19-a0aad07f9a7d/image.png" alt=""></li>
</ul>
<pre><code># 제거할 글자들
stop_words = [
    &#39;의&#39;,    &#39;.&#39;,    &#39;(&#39;,    &#39;)&#39;,    &#39;,&#39;,    &#39;%&#39;,    &#39;-&#39;,    &#39;X&#39;,    &#39;).&#39;,    &#39;x&#39;,    &#39;의&#39;,
    &#39;안&#39;,    &#39;번&#39;,    &#39;호&#39;,    &#39;발&#39;,    &#39;의&#39;,    &#39;자&#39;,    &#39;가&#39;,    &#39;를&#39;,    &#39;만&#39;,    &#39;을&#39;,
    &#39;다&#39;,    &#39;인&#39;,    &#39;김&#39;,    &#39;태&#39;,    &#39;완&#39;,    &#39;및&#39;,    &#39;정&#39;,    &#39;문&#39;,    &#39;종&#39;,    &#39;팀&#39;,
    &#39;장&#39;,    &#39;위&#39;,    &#39;의 &#39;,    &#39;호&#39;]
ko = [each_word for each_word in ko if each_word not in stop_words]
ko</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/2172b55e-c099-48f9-b55f-a6b9a9c3b84f/image.png" alt=""><pre><code>ko = nltk.Text(ko, name=&#39;대한민국 국회 의안 제 1809890호&#39;)
plt.figure(figsize=(12,6))
ko.plot(50)
plt.show()</code></pre></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/631b7c09-f815-4112-b20c-7c4baa114d44/image.png" alt=""></li>
</ul>
<p>특정단어 빈도수 조사/조회</p>
<pre><code>ko.count(&#39;고용&#39;)</code></pre><p>14</p>
<pre><code>plt.figure(figsize=(12,6))
ko.dispersion_plot([&#39;육아휴직&#39;,&#39;자녀&#39;,&#39;고용&#39;]) # dispersion_plot : 어디쯤에 위치한지 알려줌</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/e590d9aa-6210-4e91-849a-bd85855670f4/image.png" alt=""></li>
</ul>
<pre><code>ko.concordance(&#39;고용&#39;) # 좌우 글자를 보여줘, 문맥을 파악하는데 도움을 줌</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/c9064c5e-c68a-4736-8d8d-4f834e082611/image.png" alt=""></li>
</ul>
<p>연관있어보이는 단어들 출력</p>
<pre><code>ko.collocations()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8bcfc368-87a8-4333-9ece-40402dae9af3/image.png" alt=""></li>
</ul>
<p>워드클라우드 출력</p>
<pre><code>
data = ko.vocab().most_common(150)

# WordCloud 모듈은 자체적으로 단어를 추출해서 빈도수를 조사하고 정규화하는 기능을 가지고 있다
wordcloud = WordCloud(
    font_path = &quot;c:/Windows/Fonts/malgun.ttf&quot;,
    relative_scaling=0.2, # 글자 간격
    background_color=&#39;white&#39;
).generate_from_frequencies(dict(data))

plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis(&#39;off&#39;)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b5012fe2-6c51-4e03-a749-c4131c2c05ef/image.png" alt=""></li>
</ul>
<hr>
<h2 id="2-나이브베이즈-분류">2. 나이브베이즈 분류</h2>
<blockquote>
<p>나이브베이즈 분류를 이용한 감성분석</p>
</blockquote>
<p>✍🏻 영어 ver.</p>
<pre><code>from nltk.tokenize import word_tokenize
import nltk</code></pre><pre><code>train = [
    (&#39;i like you&#39;, &#39;pos&#39;),
    (&#39;i hate you&#39;, &#39;neg&#39;),
    (&#39;you like me&#39;, &#39;neg&#39;),
    (&#39;i like her&#39;, &#39;pos&#39;)
]</code></pre><p>말뭉치 만들기</p>
<pre><code>train[0]</code></pre><p>&#39;i like you&#39;</p>
<pre><code>sentence = train[0]
word_tokenize(sentence[0]) #word_tokenize : 글자 분리</code></pre><p>[&#39;i&#39;, &#39;like&#39;, &#39;you&#39;]</p>
<pre><code># set 명령으로 인해 중복 없이 출력
all_words = set(
    word.lower() for sentence in train for word in word_tokenize(sentence[0])
)
all_words</code></pre><p>{&#39;hate&#39;, &#39;her&#39;, &#39;i&#39;, &#39;like&#39;, &#39;me&#39;, &#39;you&#39;}</p>
<p><br/><br/></p>
<p>말 뭉치에서 각 단어 유무 파악</p>
<ol>
<li>train 에서 x(=문장)를 하나씩 가져올 것임</li>
<li>x[0](첫문장의 첫번째 것)을 띄어쓰기로 분리(word_tokenize)하고,</li>
<li>all_words에 있는 모든 단어(word)를 가지고 2.에 있는지 확인</li>
</ol>
<pre><code>t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t</code></pre><p>[({&#39;her&#39;: False,
   &#39;i&#39;: True,
   &#39;me&#39;: False,
   &#39;like&#39;: True,
   &#39;you&#39;: True,
   &#39;hate&#39;: False},
  &#39;pos&#39;),
 ({&#39;her&#39;: False,
   &#39;i&#39;: True,
   &#39;me&#39;: False,
   &#39;like&#39;: False,
   &#39;you&#39;: True,
   &#39;hate&#39;: True},
  &#39;neg&#39;),
 ({&#39;her&#39;: False,
   &#39;i&#39;: False,
   &#39;me&#39;: True,
   &#39;like&#39;: True,
   &#39;you&#39;: True,
   &#39;hate&#39;: False},
  &#39;neg&#39;),
 ({&#39;her&#39;: True,
   &#39;i&#39;: True,
   &#39;me&#39;: False,
   &#39;like&#39;: True,
   &#39;you&#39;: False,
   &#39;hate&#39;: False},
  &#39;pos&#39;)]</p>
<p>훈련 시작</p>
<ul>
<li>like가 있을 때 positive할 확률이 1.7 : 1 </li>
</ul>
<pre><code>classifier = nltk.NaiveBayesClassifier.train(t) # 학습
classifier.show_most_informative_features() # 가장 많은 정보를 담고 있는 특성을 나열</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/92902ced-1cff-444e-969f-0962a958f154/image.png" alt=""></li>
</ul>
<p>테스트 시작</p>
<pre><code>test_sentence = &quot;i like MeRui&quot;
test_sent_features = {
    word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words
}
test_sent_features</code></pre><p>{&#39;her&#39;: False,
 &#39;i&#39;: True,
 &#39;me&#39;: False,
 &#39;like&#39;: True,
 &#39;you&#39;: False,
 &#39;hate&#39;: False}</p>
<p> 결과</p>
<pre><code>classifier.classify(test_sent_features)</code></pre><p>&#39;pos&#39;</p>
<p><br/><br/><br/><br/></p>
<p>✍🏻 한글 ver.</p>
<pre><code>from konlpy.tag import Okt

pos_tagger = Okt()

train = [
    (&#39;메리가 좋아&#39;,&#39;pos&#39;),
    (&#39;고양이도 좋아&#39;,&#39;pos&#39;),
    (&#39;난 수업이 지루해&#39;,&#39;neg&#39;),
    (&#39;메리는 이쁜 고양이야&#39;, &#39;pos&#39;),
    (&#39;난 마치고 메리랑 놀거야&#39;,&#39;pos&#39;)
]

all_words = set(
    word.lower() for sentence in train for word in word_tokenize(sentence[0])
)

t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]

classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

test_sentence = &quot;난 수업이 마치면 메리랑 놀거야&quot;
test_sent_features = {
    word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words
}
test_sent_features</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/1c5195af-955f-4c56-815f-05c1596adf43/image.png" alt=""></li>
</ul>
<pre><code>classifier.classify(test_sent_features)</code></pre><p>&#39;neg&#39;</p>
<p><br/><br/></p>
<p>Negative가 떴으니, 형태소 분석을 통해 정확히 맞혀보자</p>
<h3 id="형태소분석">형태소분석</h3>
<p>형태소 분석을 한 뒤 품사를 단어 뒤에 붙여봄</p>
<pre><code>def tokenize(doc):
    return[&quot;/&quot;.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]</code></pre><pre><code>train_docs = [(tokenize(row[0]), row[1]) for row in train]
train_docs</code></pre><p>[([&#39;메리/Noun&#39;, &#39;가/Josa&#39;, &#39;좋다/Adjective&#39;], &#39;pos&#39;),
 ([&#39;고양이/Noun&#39;, &#39;도/Josa&#39;, &#39;좋다/Adjective&#39;], &#39;pos&#39;),
 ([&#39;난/Noun&#39;, &#39;수업/Noun&#39;, &#39;이/Josa&#39;, &#39;지루하다/Adjective&#39;], &#39;neg&#39;),
 ([&#39;메리/Noun&#39;, &#39;는/Josa&#39;, &#39;이쁘다/Adjective&#39;, &#39;고양이/Noun&#39;, &#39;야/Josa&#39;], &#39;pos&#39;),
 ([&#39;난/Noun&#39;, &#39;마치/Noun&#39;, &#39;고/Josa&#39;, &#39;메리/Noun&#39;, &#39;랑/Josa&#39;, &#39;놀다/Verb&#39;], &#39;pos&#39;)]</p>
<p>말뭉치 만들기</p>
<pre><code>tokens = [t for d in train_docs for t in d[0]]
tokens</code></pre><p>[&#39;메리/Noun&#39;,
 &#39;가/Josa&#39;,
 &#39;좋다/Adjective&#39;,
 &#39;고양이/Noun&#39;,
 &#39;도/Josa&#39;,
 &#39;좋다/Adjective&#39;,
 &#39;난/Noun&#39;,
 &#39;수업/Noun&#39;,
 &#39;이/Josa&#39;,
 &#39;지루하다/Adjective&#39;,
 &#39;메리/Noun&#39;,
 &#39;는/Josa&#39;,
 &#39;이쁘다/Adjective&#39;,
 &#39;고양이/Noun&#39;,
 &#39;야/Josa&#39;,
 &#39;난/Noun&#39;,
 &#39;마치/Noun&#39;,
 &#39;고/Josa&#39;,
 &#39;메리/Noun&#39;,
 &#39;랑/Josa&#39;,
 &#39;놀다/Verb&#39;]</p>
<pre><code>def term_exists(doc):
    return{word: (word in set(doc)) for word in tokens}

train_xy = [(term_exists(d),c) for d,c in train_docs]
train_xy</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b1bb1b75-1268-4d11-95d5-fb9e457a59a6/image.png" alt=""></li>
</ul>
<pre><code>classifier = nltk.NaiveBayesClassifier.train(train_xy)

test_sentence = &quot;난 수업이 마치면 메리랑 놀거야&quot;
test_docs = pos_tagger.pos(test_sentence[0])
test_docs

classifier.show_most_informative_features()

test_sent_features = {word: (word in tokens) for word in test_docs}
test_sent_features

classifier.classify(test_sent_features)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/42532df7-db14-43df-8c9d-c75e52b9afff/image.png" alt=""></li>
</ul>
<hr>
<h2 id="3-문장의-유사도-측정">3. 문장의 유사도 측정</h2>
<h3 id="count-vectorize">count vectorize</h3>
<h3 id="tfidf-vectorize">tfidf vectorize</h3>
<h3 id="네이버-api를-통해-유사-질문-찾기">네이버 API를 통해 유사 질문 찾기</h3>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 14. mini project _ CREDIT CARD FRAUD DETECTION]]></title>
            <link>https://velog.io/@jaam_mini/ML-14.-mini-project-CREDIT-CARD-FRAUD-DETECTION</link>
            <guid>https://velog.io/@jaam_mini/ML-14.-mini-project-CREDIT-CARD-FRAUD-DETECTION</guid>
            <pubDate>Wed, 31 Jan 2024 09:02:30 GMT</pubDate>
            <description><![CDATA[<h3 id="프로젝트-소개">프로젝트 소개</h3>
<ul>
<li><p>주제 : 신용카드 부정 사용자 검출</p>
</li>
<li><p>데이터 : <a href="https://www.kaggle.com/MLG-ULB/CREDITCARDFRAUD">https://www.kaggle.com/MLG-ULB/CREDITCARDFRAUD</a></p>
</li>
<li><p>개념</p>
<ul>
<li>신용카드와 같은 금융데이터들은 구하기가 어려움</li>
<li>금융 데이터들의 데이터는 또한 다루기 쉽지 않음</li>
<li>그러나 지능화되어가는 현대 범죄에 맞춰 사전 이상 징후 검출 등 금융 기관이 많은 노력을 기울이고 있음
이 데이터 역시 센서를 이용한 사람의 행동 과정 유추처럼 머신러닝의 이용 분야 중 하나</li>
</ul>
</li>
<li><p>개요</p>
<ul>
<li><p>신용카드 사기 검출 분류용 데이터</p>
</li>
<li><p>데이터에 class라는 이름의 컬럼이 사기 유무를 의미</p>
</li>
<li><p>class 컬럼의 불균형이 극심해서 전체 데이터의 약 0.172%가 1(사기 Fraud)를 가짐</p>
<ul>
<li>Class : Fraud 여부 (1 이면 Fraud)</li>
<li>Amount : 거래금액</li>
</ul>
<p></br></br></br></br></p>
</li>
</ul>
</li>
</ul>
<h2 id="데이터-확인">데이터 확인</h2>
<hr>
<pre><code class="language-py"># 1) 데이터 읽기
import pandas as pd

data_path = &#39;./14. mini project_creditcard.csv&#39;
raw_data = pd.read_csv(data_path)
raw_data.head()</code></pre>
<pre><code class="language-py"># 2) 특성

raw_data.columns.values</code></pre>
<pre><code class="language-py"># 3) 데이터 라벨 확인 (Class : 사기 유무)

raw_data[&#39;Class&#39;].value_counts()</code></pre>
<p>Frauds 0.17 % of the dataset</p>
<pre><code class="language-py"># 5) 데이터 선정
X = raw_data.iloc[:, 1:-1] # Time, Class 컬럼 제외
y = raw_data.iloc[:, -1] # 모든 행의 마지막 컬럼을 선택

X.shape, y.shape</code></pre>
<p>((284807, 29), (284807,))
</br></br>
train_test_split 의 인자들 (<a href="https://wikidocs.net/193722">https://wikidocs.net/193722</a>)</p>
<ul>
<li>stratify=y로 지정하면 레이블 데이터 y에 따라 학습 데이터셋과 테스트 데이터셋의 클래스 비율이 유지</li>
<li>random_state : 데이터를 나눌 때 사용되는 난수 시드, 이 값을 지정하지 않으면, 매번 실행할 때마다 다른 결과를 얻을 수 있음</li>
<li>test_size=0.2로 지정하면 전체 데이터셋의 20%를 테스트 데이터셋으로 사용</li>
<li>train_size : 학습 데이터셋의 크기 결정 (기본값은 None으로, 학습 데이터셋 크기를 1 - test_size로 결정)</li>
<li>shuffle : 데이터를 섞을지 여부를 결정<pre><code class="language-py"># 6) 데이터 나누기
</code></pre>
</li>
</ul>
<p>from sklearn.model_selection import train_test_split</p>
<p>X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)</p>
<pre><code>```py
# 7) 나눈 데이터의 불균형 정도 확인 (y_train 원소의 갯수 세기)

import numpy as np

# 원소의 갯수 세기 : unique + return_counts
# return_counts=True : 각 원소의 중복 갯수가 담긴 배열이 반환/원소가 각각 몇개 존재하는지 확인
tmp = np.unique(y_train, return_counts=True)

tmp, tmp[1], tmp[1]/len(y_train)*100</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/fcc4f880-3ae0-4fff-96cf-a813fe450208/image.png" alt=""></li>
</ul>
<pre><code class="language-py"># 8) 나눈 데이터의 불균형 정도 확인 (y_test 원소의 갯수 세기)

import numpy as np

tmp = np.unique(y_test, return_counts=True)

tmp, tmp[1], tmp[1]/len(y_test)*100 # %를 구한 것</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/7ae6da14-c62c-4933-ac70-30b7e423dab7/image.png" alt=""></li>
</ul>
<h3 id="무식-ver-데이터-분석">(무식 ver.) 데이터 분석</h3>
<hr>
<pre><code class="language-py"># 1) 분류기 성능 return 함수 설정

from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)

def get_clf_eval(y_test, pred):
    acc = accuracy_score(y_test, pred) # 정확도 : 정확하게 정답을 맞힌 비중
    pre = precision_score(y_test, pred) # 정밀도 : positive 예측치 중 실제 positive 관측치 비중
    re = recall_score(y_test, pred) # 재현율 : positive 관측치 중에서 실제로 예측된 비중
    f1 = f1_score(y_test, pred) # Precision과 recall의 조화평균(정밀도, 재현율 -&gt; 평균)
    auc = roc_auc_score(y_test, pred) # 모델의 성능

    return acc, pre, re, f1, auc



from sklearn.metrics import confusion_matrix

def print_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)

    print(&#39;=&gt; confusion metrix&#39;)
    print(confusion)
    print(&#39;==================&#39;)

    print(&#39;Accuracy : {0:.4f}, Precision : {1:.4f} &#39;.format(acc, pre))
    print(&#39;Recall : {0:.4f}, F1 : {1:.4f}, AUC : {2:.4f} &#39;.format(re, f1, auc))

# (https://coduking.com/entry/ROC-curve-AUC-%EA%B0%9C%EB%85%90-%EB%B0%8F-sklearn-%EC%BD%94%EB%93%9C)
# (https://coduking.com/entry/%EB%B6%84%EB%A5%98%EB%AC%B8%EC%A0%9C-%EC%84%B1%EB%8A%A5%ED%8F%89%EA%B0%80-%EC%A7%80%ED%91%9C-Accuracy-Recall-Precision-F1-score-titanic-%EC%8B%A4%EC%8A%B5)</code></pre>
<pre><code class="language-py"># 2) Logistic Regression

from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=13, solver=&#39;liblinear&#39;)
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)

print_clf_eval(y_test, lr_pred)</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/e8d9ce3c-e2d8-4a90-8df5-9fee435a726a/image.png" alt=""></li>
</ul>
<p>Accuracy가 99.92%로 보이지만,
실제 1중에서 몇개를 맞췄는지 보는 Recall 의 값이 59%에 불과함
-&gt; Fraud 검출을 못했다고 봐야 함
-&gt; 더 성능을 끌어 올려야 함</p>
<pre><code class="language-py"># 3) Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

print_clf_eval(y_test, dt_pred)</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/94f55428-6f74-4161-bb6f-3d20b675f788/image.png" alt=""></li>
</ul>
<p>DecisionTreeClassifier 의 결과는 106개 중 42개가 틀렸고 71.62% 로 나옴. 
이전 보다 높음</p>
<pre><code class="language-py"># 4) Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)

print_clf_eval(y_test, rf_pred)</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b12e62bd-0b3d-4d18-85ac-b48a4a51d365/image.png" alt=""></li>
</ul>
<p>recall이 조금더 올라감.
이전보다 덜 틀린 38개</p>
<pre><code class="language-py"># 5) LightGBM

from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)

print_clf_eval(y_test, lgbm_pred)</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/4c7e7025-65ef-4e3d-8bb5-4d6ebbbc3603/image.png" alt=""></li>
</ul>
<p>성능이 조-금 좋아진 느낌.</p>
<h3 id="한걸음-전진ver-분석">(한걸음 전진ver.) 분석</h3>
<ul>
<li>은행 입장에서는 Recall이 좋을 것이다.</li>
<li>사용자 입장에서는 Precision이 좋겠지.</li>
<li>왜?  -&gt;&gt;<br/></li>
<li>get_clf_eval : 성능지표</li>
</ul>
<pre><code class="language-py"># 1) 모델, 데이터를 주고 성능을 출력하는 함수

def get_result(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    return get_clf_eval(y_test, pred)</code></pre>
<pre><code class="language-py"># 2) 여러개 모델의 성능을 정리 -&gt; DataFrame 반환

def get_result_pd(models, model_names, X_train, y_train, X_test, y_test):
    col_name = [&#39;accuracy&#39;, &#39;precision&#39;, &#39;recall&#39;, &#39;f1&#39;, &#39;roc_auc&#39;]
    tmp = []

    for model in models:
        tmp.append(get_result(model, X_train, y_train, X_test, y_test))

    return pd.DataFrame(tmp, columns=col_name, index=model_names)</code></pre>
<pre><code class="language-py"># 3) 4개의 분류모델 &gt; 표 (정리)

import time

models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = [&#39;LinearReg&#39;, &#39;DecisionTree&#39;, &#39;RandomForest&#39;, &#39;LightGBM&#39;]

start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)

print(&#39;Fit time : &#39;, time.time() - start_time)
results</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/52ee0276-050a-49f8-9594-990470ffb21e/image.png" alt=""></li>
</ul>
<h3 id="데이터-정리--크기-조정--scaling-ver-분석">(데이터 정리 &amp; 크기 조정 / scaling ver.) 분석</h3>
<ul>
<li>그래프로 &#39;Amount&#39;의 값이 어떻게 분포되어 있는지 확인</li>
</ul>
<pre><code class="language-py"># 1) raw_data의 Amount 컬럼 확인

plt.figure(figsize=(10,5))
sns.distplot(raw_data[&#39;Amount&#39;], color=&#39;b&#39;)
plt.show()</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/eedeea4f-a068-4fac-9254-7a29af44933c/image.png" alt=""></li>
</ul>
<pre><code class="language-py">raw_data[&#39;Amount&#39;].values</code></pre>
<p>array([149.62,   2.69, 378.66, ...,  67.88,  10.  , 217.  ])</p>
<pre><code class="language-py">raw_data[&#39;Amount&#39;].values.reshape(-1,1)
# reshape : https://domybestinlife.tistory.com/149</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/cd200c14-ec0a-4870-a476-36ec6f662f8e/image.png" alt=""></li>
</ul>
<pre><code class="language-py">raw_data.iloc[:, 1:-2] # Time, Amount, Class 삭제</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/563c8658-509c-40d1-b2e8-4b3c5e0efa74/image.png" alt=""></li>
</ul>
<p>StandardScaler 를 통해,
&#39;Amount&#39;가 몰려있는 상태의 편향성을 바꿔보고 싶음</p>
<pre><code class="language-py"># 2) Amount + StandardScaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
amount_n = scaler.fit_transform(raw_data[&#39;Amount&#39;].values.reshape(-1,1))

raw_data_copy = raw_data.iloc[:, 1:-2]
raw_data_copy[&#39;Amount_Scaler&#39;] = amount_n # StandardScaler 학습한 컬럼 생성
raw_data_copy.head()</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/d1e66b8c-53b4-47e5-8036-38b2d69f85ee/image.png" alt=""></li>
</ul>
<pre><code class="language-py"># 3) 데이터 나누기 &gt;&gt; 재평가

X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3, random_state=13, stratify=y)

models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = [&#39;LinearReg&#39;, &#39;DecisionTree&#39;, &#39;RandomForest&#39;, &#39;LightGBM&#39;]

start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)

print(&#39;Fit time : &#39;, time.time() - start_time)
results</code></pre>
<ul>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/bd822b91-555e-4e55-a068-1aa5f596d720/image.png" alt=""></p>
</li>
<li><p>models 안에 model이 넘어올건데,
그 모델에다가 predict를 시키고, predict_proba를 X_test에 대해서 시켜 줌
왜? ROC 커브를 그리려면 &#39;확률값(predict_proba)&#39;이 있어야 하기 때문</p>
</li>
<li><p>대각선 그리는 방법 :  plt.plot([0,1], [0,1], &#39;k--&#39;, label=&#39;random quess&#39;)</p>
</li>
</ul>
<pre><code class="language-py"># 4) 모델별 ROC 커브

from sklearn.metrics import roc_curve

def draw_roc_curve(models, model_names, X_test, y_test):
    plt.figure(figsize=(10,10))

    for model in range(len(models)):
        pred = models[model].predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, pred)
        plt.plot(fpr, tpr, label=model_names[model])
    # 대각선
    plt.plot([0,1], [0,1], &#39;k--&#39;, label=&#39;random quess&#39;)
    plt.title(&#39;ROC&#39;)
    plt.legend()
    plt.grid()
    plt.show()

# draw_roc_curve(models, model_names, X_test, y_test)
</code></pre>
<pre><code class="language-py">from sklearn.metrics import roc_curve

def draw_roc_corve(models,model_names,  X_test, y_test):
    plt.figure(figsize=(10, 10))

    for model in range(len(models)):
        pred = models[model].predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, pred)
        plt.plot(fpr, tpr, label=model_names[model])

    plt.plot([0, 1], [0, 1], &#39;k--&#39;, label=&#39;random quess&#39;)
    plt.title(&#39;ROC&#39;)
    plt.legend()
    plt.grid()
    plt.show()

draw_roc_corve(models, model_names, X_test, y_test)</code></pre>
<ul>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/58bad469-a19e-4401-a314-9bb6d3323793/image.png" alt=""></p>
</li>
<li><p>log 함수를 적용해 보겠음!
log 함수 : 높은 값은 상대적으로 낮게 잡아주고 낮은 값은 그대로 사용</p>
</li>
</ul>
<pre><code class="language-py"># 5) log scale 확인

amount_log = np.log1p(raw_data[&#39;Amount&#39;])

raw_data_copy[&#39;Amount_Scaler&#39;] = amount_log
raw_data_copy.head()</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8a13da07-cea3-4a61-97c6-5ee101b8e4a5/image.png" alt=""></li>
</ul>
<pre><code class="language-py"># 6) 분포(displot) 확인
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,5))
sns.distplot(raw_data_copy[&#39;Amount_Scaler&#39;], color=&#39;r&#39;)
plt.show()</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/2064f81a-340c-4878-9d65-8e002e014a3c/image.png" alt=""><pre><code class="language-py"># 7) 성능 확인
</code></pre>
</li>
</ul>
<p>X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3, random_state=13, stratify=y)
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)</p>
<p>print(&#39;Fit time : &#39;, time.time() - start_time)
results</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/5f34daa2-2e82-46c4-8875-76cbc0e35e01/image.png)

### 데이터의 Outlier를 정리


```py
# 1) 특이한 데이터 확인
import seaborn as sns

plt.figure(figsize=(10,7))
sns.boxplot(data=raw_data[[&#39;V13&#39;, &#39;V14&#39;, &#39;V15&#39;]]);
</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/d3c244e1-ae39-4f23-830a-0de9dd0a7864/image.png" alt=""></p>
<pre><code class="language-py"># 2) Outlier를 정리하기 위해 Outlier의 인덱스를 파악하는 코드

def get_outlier(df=None, column=None, weight=1.5):
    fraud = df[df[&#39;Class&#39;]==1][column]
    # 25% 지점
    quantile_25 = np.percentile(fraud.values, 25)
    # 75% 지점
    quantile_75 = np.percentile(fraud.values, 75)

    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight # (weight = 1.5)
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    # 제거할 outlier_index 를 설정
    outlier_index = fraud[(fraud &lt; lowest_val) | (fraud &gt; highest_val)].index

    return outlier_index</code></pre>
<pre><code class="language-py"># 3) 파악하는 코드 작성했으니, Outlier 찾기

get_outlier(df=raw_data, column=&#39;V14&#39;, weight=1.5)</code></pre>
<p>Index([8296, 8615, 9035, 9252], dtype=&#39;int64&#39;)</p>
<pre><code class="language-py"># 4) Outlier 제거 전에 전체 개수 확인
raw_data_copy.shape</code></pre>
<p>(284807, 29)</p>
<pre><code class="language-py"># 5) Outlier 제거
outlier_index = get_outlier(df=raw_data, column=&#39;V14&#39;, weight=1.5)
raw_data_copy.drop(outlier_index, axis=0, inplace=True) # 행제거 (axis=0)
raw_data_copy.shape</code></pre>
<p>(284803, 29)</p>
<pre><code class="language-py"># 6) Outlier 제거 후 데이터 다시 나누기 &gt;&gt; 재평가
X = raw_data_copy

raw_data.drop(outlier_index, axis=0, inplace=True)
y = raw_data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3, random_state=13, stratify=y)

models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = [&#39;LinearReg&#39;, &#39;DecisionTree&#39;, &#39;RandomForest&#39;, &#39;LightGBM&#39;]

start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)

print(&#39;Fit time : &#39;, time.time() - start_time)
results</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8a0ac954-906d-4e64-a224-a99da951be7f/image.png" alt=""></li>
</ul>
<h3 id="smote-oversampling">SMOTE Oversampling</h3>
<ul>
<li>데이터의 불균형이 극심할 때 불균형한 두 클래스의 분포를 강제로 맞춰보는 작업</li>
<li>언더샘플링 : 많은 수의 데이터를 적은 수의 데이터로 강제로 조정</li>
<li>오버샘플링 :<ul>
<li>원본데이터의 피처 값들을 아주 약간 변경하여 증식</li>
<li>대표적으로 SMOTE(Synthetic Minority Over-sampling Technique) 방법이 있음</li>
<li>적은 데이터 세트에 있는 개별 데이터를 k-최근접이웃 방법으로 찾아서 데이터의 분포 사이에 새로운 데이터를 만드는 방식</li>
<li>imbalanced-learn 이라는 Python pkg가 있음<pre><code class="language-py">!pip install imbalanced-learn</code></pre>
</li>
</ul>
</li>
</ul>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 13. GBM - Gradient Boosting Machine]]></title>
            <link>https://velog.io/@jaam_mini/ML-13.-GBM-Gradient-Boosting-Machine</link>
            <guid>https://velog.io/@jaam_mini/ML-13.-GBM-Gradient-Boosting-Machine</guid>
            <pubDate>Wed, 31 Jan 2024 01:08:48 GMT</pubDate>
            <description><![CDATA[<p>[이번에 사용한 데이터] : HAR_dataset</p>
<pre><code class="language-py">import pandas as pd
url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt&#39;

# &#39;\s+&#39; 공백, header 그대로, 컬럼 이름 names
feature_name_df = pd.read_csv(url, sep=&#39;\s+&#39;, header=None, names=[&#39;columns_index&#39;,&#39;columns_name&#39;])

# 밸류만 가지고 feature_name 추출 -&gt; 즉, 앞으로 561개의 이름만 저장하게 됨
feature_name = feature_name_df.iloc[:, 1].values.tolist()

X_train_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/X_train.txt&#39;
X_test_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/X_test.txt&#39;
X_train = pd.read_csv(X_train_url, sep=&#39;\s+&#39;, header=None)
X_test = pd.read_csv(X_test_url, sep=&#39;\s+&#39;, header=None)

X_train.columns = feature_name
X_test.columns = feature_name

y_train_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/y_train.txt&#39;
y_test_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/y_test.txt&#39;
y_train = pd.read_csv(y_train_url, sep=&#39;\s+&#39;, header=None, names=[&#39;action&#39;])
y_test = pd.read_csv(y_test_url, sep=&#39;\s+&#39;, header=None, names=[&#39;action&#39;])
</code></pre>
<h1 id="gbm">GBM</h1>
<hr>
<p>GBM - Gradient Boosting Machine</p>
<ul>
<li>부스팅 알고리즘은 여러 개의 약한 학습기(weak learner)를 순차적으로 학습-예측하면서 
잘못 예측한 데이터에 가중치를 부여해서 오류를 개선해가는 방식</li>
<li>GBM은 가중치를 업데이트할 때 경사 하강법(Gradient Descent)을 이용하는 것이 큰 차이</li>
</ul>
<pre><code class="language-py">from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings

warnings.filterwarnings(&#39;ignore&#39;)</code></pre>
<pre><code class="language-py"># GradientBoostingClassifier
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)

print(&#39;ACC : &#39;, accuracy_score(y_test, gb_pred))
print(&#39;Fit time : &#39;, time.time() - start_time)</code></pre>
<p>다른 분들은 40분만에 결과를 볼 수 있었다고 했는데..
난 저녁 약속을 다녀와도 계속 running 중이라 멈출 수 밖에 없었다....🙄</p>
<pre><code class="language-py"># GridSearch로 조금 더 찾아보자~
from sklearn.model_selection import GridSearchCV

params = {
    &#39;n_estimators&#39; : [100,500],    &#39;learning_rate&#39; : [0.05, 0.1]
}

start_time = time.time()
grid = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)
print(&#39;Fit time : &#39;, time.time() - start_time)</code></pre>
<pre><code class="language-py"># test 성능
accuracy_score(y_test, grid.best_estimator_.predict(X_test))</code></pre>
<pre><code class="language-py"># test 성능
accuracy_score(y_test, grid.best_estimator_.predict(X_test))</code></pre>
<h1 id="xgboost">XGBoost</h1>
<hr>
<ul>
<li>XGBoost는 트리 기반의 앙상블 학습에서 가장 각광받는 알고리즘 중 하나</li>
<li>GBM 기반의 알고리즘인데, GBM의 느린 속도를 다양한 규제를 통해 해결</li>
<li>특히 병렬 학습이 가능하도록 설계됨</li>
<li>XGBoost는 반복 수행 시마다 내부적으로 학습데이터와 검증데이터를 교차검증을 수행</li>
<li>교차검증을 통해 최적화되면 반복을 중단하는 조기 중단 기능을 가지고 있음</li>
</ul>
</br>
파라미터 종류
- nthread : CPU의 실행 스레드 개수를 조정. 디폴트는 CPU의 전체 스레드를 사용하는 것
- eta : GBM 학습률
- num_boost_rounds : n_estimators와 같은 파라미터
- max_depth

<pre><code class="language-py">!pip install xgboost</code></pre>
<pre><code class="language-py">from xgboost import XGBClassifier

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# numpy array 값을 받아들이기 때문에, values 값만 넣어야 한다.
xgb.fit(X_train.values, y_train)
print(&#39;Fit time : &#39;, time.time() - start_time)

# 289.586 나옴</code></pre>
<pre><code class="language-py">accuracy_score(y_test, grid.best_estimator_.predict(X_test.values))
#0.9392 나옴</code></pre>
<pre><code class="language-py"># 조기 종료 설정 (early_stopping_round)

from xgboost import XGBClassifier

evals = [(X_test.values, y_test)]

start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# numpy array 값을 받아들이기 때문에, values 값만 넣어야 한다.
# early_stopping_rounds=10 : 같은 성능으로 10번 이상 비슷한 값이 나오면 종료 해라
xgb.fit(X_train.values, y_train, early_stopping_rounds=10, eval_set=evals)
print(&#39;Fit time : &#39;, time.time() - start_time)
</code></pre>
<h1 id="lightgbm">LightGBM</h1>
<hr>
<ul>
<li>LightGBM은 XGBoost와 함께 부스팅 계열에서 가장 각광받는 알고리즘</li>
<li>LGBM의 큰 장점은 속도</li>
<li>단, 적은 수의 데이터에는 어울리지 않음 (일반적으로 10000건 이상의 데이터가 필요하다고 함)</li>
<li>GPU 버전도 존재함</li>
</ul>
<pre><code class="language-py">!pip install lightgbm</code></pre>
<pre><code class="language-py">start_time = time.time()</code></pre>
<pre><code class="language-py">from lightgbm import LGBMClassifier
import time

evals = [(X_test.values, y_test)]

start_time = time.time()
lgbm = LGBMClassifier(n_estimators=400)
lgbm.fit(X_train.values, y_train,eval_set=evals)
print(&#39;Fit time : &#39;, time.time() - start_time)</code></pre>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 12. kNN (k Nearest Neighbor)]]></title>
            <link>https://velog.io/@jaam_mini/ML-11.-kNN-k-Nearest-Neighber</link>
            <guid>https://velog.io/@jaam_mini/ML-11.-kNN-k-Nearest-Neighber</guid>
            <pubDate>Tue, 30 Jan 2024 05:36:00 GMT</pubDate>
            <description><![CDATA[<h3 id="knn-이란">kNN 이란?</h3>
<hr>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/495cbe54-1ccd-40d5-856e-6a3422e8abcc/image.png" alt=""></p>
<ul>
<li>새로운 데이터가 있을 때, 기존 데이터의 그룹 중 어떤 그룹에 속하는지 분류하는 문제</li>
<li>k는 몇 번째 가까운 데이터까지 볼 것인가를 정하는 수치 </br></li>
<li>즉, 쉽게 말해 새로운 데이터(검은점)이 빨강-파랑 중 어디로 분류 되는지 정하는 것</br></li>
<li>더 간단히 말해, K값을 설정하고, 그 값에 가까이 있는 애로 분류할게~
</br></br></li>
</ul>
<p>더 자세히 볼까요???
N은 파랑과 녹색 중 어디 일까?
<img src="https://velog.velcdn.com/images/jaam_mini/post/b7a4f829-f321-4ccc-8549-067e2526159d/image.png" alt=""></p>
<ul>
<li><p>2번째 거리에 가깝게 설정 ; 세모 그룹
<img src="https://velog.velcdn.com/images/jaam_mini/post/b67f3d2b-2603-4839-b937-264daafbf170/image.png" alt=""></p>
</li>
<li><p>3번째 거리에 가깝게 설정 ; 동그라미 그룹
<img src="https://velog.velcdn.com/images/jaam_mini/post/a0408ee8-9a18-4e48-9f5a-eb2989480ab1/image.png" alt=""></p>
</li>
</ul>
<h3 id="💡따라서-k값거리는-표준화-해주는-것이-상당히-중요하다">💡따라서 k값(거리)는 표준화!! 해주는 것이 상당히 중요하다</h3>
<p></br></br></p>
<h3 id="knn-장단점">kNN 장단점</h3>
<hr>
<ul>
<li>실시간 예측을 위한 학습이 필요치 않다</li>
<li>결국  속도가  빨라진다</li>
<li>고차원  데이터에는  적합하지  않다</li>
</ul>
<p></br></br></br></p>
<h3 id="실습-_-iris">실습 _ iris</h3>
<hr>
<pre><code>from sklearn.datasets import load_iris
iris = load_iris()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=13, stratify=iris.target
)</code></pre><pre><code># 1) kNN 학습

from sklearn.neighbors import KNeighborsClassifier

# n_neighbors= : 몇개 까지 가까운걸 찾을래?
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)</code></pre><pre><code># 2) accuracy 확인

from sklearn.metrics import accuracy_score

pred = knn.predict(X_test)
print(accuracy_score(y_test, pred))</code></pre><p>0.9666666666666667</p>
<pre><code># 3) 간단한 성과 (?)

from sklearn.metrics import (classification_report, confusion_matrix)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/a273de0c-4a61-4455-9a0f-181be060bece/image.png" alt=""></p>
<p>간단한 데이터를 다룰 때 kNN은 큰 두각을 나타내지 못합니다...
다음 언젠가 실습 시 두각을 나타내는 결과를 보길 기대하며..</p>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 11. 앙상블기법_Boosting Algorithm (기초)]]></title>
            <link>https://velog.io/@jaam_mini/ML-11.-%EC%95%99%EC%83%81%EB%B8%94%EA%B8%B0%EB%B2%95Boosting-Algorithm</link>
            <guid>https://velog.io/@jaam_mini/ML-11.-%EC%95%99%EC%83%81%EB%B8%94%EA%B8%B0%EB%B2%95Boosting-Algorithm</guid>
            <pubDate>Tue, 30 Jan 2024 03:48:48 GMT</pubDate>
            <description><![CDATA[<h3 id="앙상블기법">앙상블기법</h3>
<hr>
<ul>
<li>앙상블은 전통적으로 Voting, Boosting, Bagging, 스태킹 으로 나뉨</br></li>
<li>보팅과 배깅은 여러 개의 분류기가 투표를 통해 최종 예측 결과를 결정하는 방식이다</li>
<li>둘의 차이점은 보팅은 각각 다른 분류기, 배깅은 같은 분류기를 사용</li>
<li>대표적인 Bagging 방식은 RandomForest 이다.<ul>
<li>Voting<ul>
<li>하나의 데이터(전체 데이터)를 다 쓰면서 각각 다른 알고리즘을 적용 시키는 방법</li>
</ul>
</li>
<li>Bagging<ul>
<li>하나의 알고리즘을 쓰는데, 전체 데이터를 나눠서 씀</li>
<li>나누는 방법 ; 
중복을 허락해서 데이터를 수집 한다 (boot strapping)</li>
</ul>
</li>
</ul>
</li>
</ul>
</br>

<p>그렇다면, Boosting은 뭘까..?
알아봅시다 😎</p>
<h3 id="boosting-이란">Boosting 이란?</h3>
<hr>
<ul>
<li>여러개의 <code>약한(?)분류기</code>(=성능이 떨어지고 겁나 빠른 = DecisionTree, maxdepth=2_낮게 주는 것)가 순차적으로 학습하면서 앞에서 학습한 분류기가 예측이 <code>틀린 데이터</code>에 대해 <code>다음 분류기</code>가 가중치를 인가해 <code>학습을 이어 진행</code>하는 방식</br></li>
<li>예측 성능이 뛰어나 앙상블 학습을 주도함</br>

</li>
</ul>
<h3 id="boosting-기법-3가지">Boosting 기법 3가지</h3>
<hr>
<ol>
<li>GBM (Gradient Boosting Machine)<ul>
<li>AdaBoost 기법과 비슷하지만 가중치를 업데이트 할때 <code>경사하강법(Gradient Descent)</code>를 사용</li>
</ul>
</li>
</ol>
</br>

<ol start="2">
<li>XGBoost (eXtra Gradient Boost)<ul>
<li>GBM에서 PC의 파워를 효율적으로 사용하기 위한 다양한 기법이 채택되어 빠른 속도와 효율을 가짐</li>
<li>GBM에서 효율을 극도로 올리고 CPU를 쓸 수 있게 하는 것</li>
</ul>
</li>
</ol>
</br>

<ol start="3">
<li>LightGBM (Light Gradient Boost)<ul>
<li>XGBoost 보다 빠른 속도를 가짐</li>
<li>속도를 향상시키기 위한 각종 장치들이 있음</li>
</ul>
</li>
</ol>
<p></br></br></p>
<h3 id="bagging과-boosting-의-차이는">Bagging과 Boosting 의 차이는?</h3>
<hr>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/276ae77c-6bc8-4f6c-995d-aaeb29ce9aaa/image.png" alt=""></p>
<ul>
<li>Bagging
<img src="https://velog.velcdn.com/images/jaam_mini/post/ff119e63-3d2c-461b-b15a-c23cde51e420/image.png" alt=""><ul>
<li>데이터를 통 or 잘라서 쓰던지 <code>학습하는 타이밍</code>이 <code>동시</code>에 이뤄짐
(= 한번에 병렬적으로 결과를 얻음)</li>
<li>데이터들이 각각의 분류기에 들어가고 각각의 모델들이 동시에 학습을 해서 결과를 투표</li>
</ul>
</li>
</ul>
<ul>
<li>Boosting
<img src="https://velog.velcdn.com/images/jaam_mini/post/11d0b1aa-42e8-4d9b-a5c3-b15d84502609/image.png" alt=""><ul>
<li>데이터를 가지고 학습</li>
<li>그 결과(틀린것, 가중치가 필요한 것들)를 가지고 또 학습</li>
<li>또 학습
(= 순차적으로 진행됨)</li>
</ul>
</li>
</ul>
<p></br></br></br></p>
<p>Boosting 계열의 기본적인 그림(AdaBoost)을 통해 알아봅시다~</p>
<h3 id="boosting-계열-개념-설명">Boosting 계열 개념 설명</h3>
<hr>
<ul>
<li>D1
먼저 +-를 구분해야 하는데, 매우 약한 분류기를 썼기 때문에 성능의 경계면이 말도 안되게 설정됨...
<img src="https://velog.velcdn.com/images/jaam_mini/post/d1748955-8339-4a58-afe0-8c71ea44899c/image.png" alt=""></li>
</ul>
<ul>
<li>D2
그리고 틀린 아이들에게 <code>가중치</code>를 줌
<img src="https://velog.velcdn.com/images/jaam_mini/post/ddbefe56-744b-4511-a016-2d92b8457d3d/image.png" alt=""></li>
</ul>
<ul>
<li>D3
다시 놓친 -에 가중치를 인가해, 다시 경계를 설정
<img src="https://velog.velcdn.com/images/jaam_mini/post/3fca3bd5-e4c4-4d38-a742-6ef7e2fd30d5/image.png" alt=""></li>
</ul>
<ul>
<li>마지막 단계
앞서 결정한 경계들을 합침 (이어붙임)
<img src="https://velog.velcdn.com/images/jaam_mini/post/0a6844ea-485e-4a49-a393-2a59e4180681/image.png" alt=""></li>
</ul>
<p></br></br></p>
<h2 id="실습---wine-data">실습 - Wine data</h2>
<hr>
<h3 id="1-데이터-확인">1. 데이터 확인</h3>
<pre><code class="language-py"># 1) 데이터 가져오기
import pandas as pd
wine_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv&#39;
wine = pd.read_csv(wine_url,index_col=0)
wine.head()

# 2) 맛 등급 설정
    # (1) quality 컬럼 이진화
    # wine 데이터의 [&#39;taste&#39;] 컬럼 생성
    # wine의 quality column울 grade로 잡고, 5등급 보다 크면 1, 그게 아니라면 0으로 잡음
wine[&#39;taste&#39;] = [1. if grade&gt;5 else 0. for grade in wine[&#39;quality&#39;]]
    # (2) 모델링
    # label인 taste, quality를 drop, 나머지를 X의 특성으로 봄
X = wine.drop([&#39;taste&#39;, &#39;quality&#39;], axis=1)
# 새로만들 y데이터
y = wine[&#39;taste&#39;]</code></pre>
<pre><code class="language-py"># 3) StandardScaler
from sklearn.preprocessing import StandardScaler
# StandardScaler를 installation
sc = StandardScaler()
# X 데이터를 StandardScaler로 변환
X_sc = sc.fit_transform(X)

# 4) 데이터 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)</code></pre>
<pre><code class="language-py"># 5) 히스토그램
import matplotlib.pyplot as plt
%matplotlib inline

wine.hist(bins=10, figsize=(15,15))
plt.show()</code></pre>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/8821e660-36c1-4070-9082-62479ddc0341/image.png" alt=""></p>
<pre><code class="language-py">wine.columns.values</code></pre>
<p>array([&#39;fixed acidity&#39;, &#39;volatile acidity&#39;, &#39;citric acid&#39;,
       &#39;residual sugar&#39;, &#39;chlorides&#39;, &#39;free sulfur dioxide&#39;,
       &#39;total sulfur dioxide&#39;, &#39;density&#39;, &#39;pH&#39;, &#39;sulphates&#39;, &#39;alcohol&#39;,
       &#39;quality&#39;, &#39;color&#39;, &#39;taste&#39;], dtype=object)</p>
<pre><code class="language-py"># 6) quality 별 어떤 특성이 있는지 확인

column_names = [&#39;fixed acidity&#39;, &#39;volatile acidity&#39;, &#39;citric acid&#39;,
       &#39;residual sugar&#39;, &#39;chlorides&#39;, &#39;free sulfur dioxide&#39;,
       &#39;total sulfur dioxide&#39;, &#39;density&#39;, &#39;pH&#39;, &#39;sulphates&#39;, &#39;alcohol&#39;]

df_pivot_table = wine.pivot_table(column_names, [&#39;quality&#39;], aggfunc=&#39;median&#39;)
print(df_pivot_table)</code></pre>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/f8fd6194-bcdd-4940-a20a-0bec05a701f8/image.png" alt=""></p>
<pre><code class="language-py"># 7) quality 대한 나머지 특성들의 상관관계
# (주의사항) : 상관관계를 sort_values로 볼때, |절대값|으로 값을 생각해야 함, -라고 안좋은게 아님. 

corr_matrix = wine.corr()
corr_matrix[&#39;quality&#39;].sort_values(ascending=False)</code></pre>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/ed93632d-d52c-4d5d-ac18-8e53391f9b3a/image.png" alt=""></p>
<h3 id="2-train">2. train</h3>
<p>⭐오늘의 keypoint</p>
<pre><code class="language-py"># 8) 다양한 모델을 한번에 테스트

# ensemble(앙상블 기법) 에서 3가지 분류기 사용
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
# 각 분류기 import
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# 빈 리스트 만들고
models = []
# 전부 append 시켜줌 (이름, 분류기 함수()) - 리스트으로 저장(리스트는 뭐든 들어가기 때문)
models.append((&#39;RandomForestClassifier&#39;, RandomForestClassifier()))
models.append((&#39;DecisionTreeClassifier&#39;, DecisionTreeClassifier()))
models.append((&#39;AdaBoostClassifier&#39;, AdaBoostClassifier()))
models.append((&#39;GradientBoostingClassifier&#39;, GradientBoostingClassifier()))
models.append((&#39;LogisticRegression&#39;, LogisticRegression(solver=&#39;liblinear&#39;)))</code></pre>
<pre><code class="language-py">models</code></pre>
<p>[(&#39;RandomForestClassifier&#39;, RandomForestClassifier()),
 (&#39;DecisionTreeClassifier&#39;, DecisionTreeClassifier()),
 (&#39;AdaBoostClassifier&#39;, AdaBoostClassifier()),
 (&#39;GradientBoostingClassifier&#39;, GradientBoostingClassifier()),
 (&#39;LogisticRegression&#39;, LogisticRegression(solver=&#39;liblinear&#39;))]</p>
<pre><code class="language-py"># 9) 각 분류기별 models 결과를  저장하기  위한  작업

%time
# 러닝시간 측정
# CPU times: total: 0 ns
# Wall time: 0 ns

from sklearn.model_selection import KFold, cross_val_score

results = []
names = []

# models 는 이미 리스트 안에 튜플로 되어 있음 (위에 쿼리)
# 그렇기 때문에 name 과 model로 받을 수 있음
for name, model in models:
    # kfold 선언 = (5겹 폴딩, - , 5개로 나누기 전에 데이터를 썪어라)
    kfold = KFold(n_splits=5, random_state=13, shuffle=True)
    # 5개의 model 마다 X_train, y_train 데이터로 kfolding(cv=kfold) 시킴
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=kfold, scoring=&#39;accuracy&#39;)
    results.append(cv_results)
    names.append(name)

    print(name, cv_results.mean(), cv_results.std())

# 결과 : results 변수에는 5개의 알고리즘 성능들이 저장되어 있음
# cv_results.mean() : training data를 5겹으로 나눈 mean(평균값)
</code></pre>
<p>CPU times: total: 0 ns
Wall time: 0 ns
RandomForestClassifier 0.8235476049455839 0.014660814747173595
DecisionTreeClassifier 0.7548571111275635 0.007232581517245795
AdaBoostClassifier 0.7533103205745169 0.02644765901536818
GradientBoostingClassifier 0.7663961279336641 0.02129278386035166
LogisticRegression 0.7423482268453395 0.014274628192480914</p>
<pre><code class="language-py">results
# 5개 알고리즘 마다 5번 폴딩했을 때 결과값(=성능)</code></pre>
<p>[array([0.82019231, 0.85      , 0.80846968, 0.8267565 , 0.81231954]),
 array([0.75192308, 0.76538462, 0.74879692, 0.76130895, 0.74687199]),
 array([0.74903846, 0.80384615, 0.72666025, 0.74687199, 0.74013474]),
 array([0.77019231, 0.80192308, 0.73820982, 0.76900866, 0.75264678]),
 array([0.73269231, 0.76826923, 0.74013474, 0.7439846 , 0.72666025])]</p>
<pre><code class="language-py">names
# results 항목명</code></pre>
<p>[&#39;RandomForestClassifier&#39;,
 &#39;DecisionTreeClassifier&#39;,
 &#39;AdaBoostClassifier&#39;,
 &#39;GradientBoostingClassifier&#39;,
 &#39;LogisticRegression&#39;]</p>
<pre><code class="language-py"># 10) cross-validation 결과를  일목요연하게  확인하기
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(12, 5))
fig.suptitle(&#39;Algorithm Comparison&#39;)
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()</code></pre>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/0bcf9ed2-8c9a-47a4-87a5-6692e4c95040/image.png" alt=""></p>
<h3 id="3-test">3. test</h3>
<pre><code class="language-py"># 11) 테스트  데이터에  대한  평가  결과
from sklearn.metrics import accuracy_score

for name, model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(name, accuracy_score(y_test, pred))</code></pre>
<p>RandomForestClassifier 0.8392307692307692
DecisionTreeClassifier 0.7838461538461539
AdaBoostClassifier 0.7553846153846154
GradientBoostingClassifier 0.7884615384615384
LogisticRegression 0.7446153846153846</p>
]]></description>
        </item>
        <item>
            <title><![CDATA[[EDA] mini project 12 _ 세계 테러 분석 ]]></title>
            <link>https://velog.io/@jaam_mini/EDA-mini-project-12-%EC%84%B8%EA%B3%84-%ED%85%8C%EB%9F%AC-%EB%B6%84%EC%84%9D</link>
            <guid>https://velog.io/@jaam_mini/EDA-mini-project-12-%EC%84%B8%EA%B3%84-%ED%85%8C%EB%9F%AC-%EB%B6%84%EC%84%9D</guid>
            <pubDate>Sun, 28 Jan 2024 17:12:46 GMT</pubDate>
            <description><![CDATA[<h1 id="전처리">전처리</h1>
<hr>
<p>데이터 불러오기</p>
<pre><code>import pandas as pd

raw_data = pd.read_csv(&#39;./globalterrorismdb_0718dist.csv&#39;, encoding = &#39;latin-1&#39;)
raw_data.head()</code></pre><p>모든 컬럼 확인</p>
<pre><code>raw_data.columns.values</code></pre><p>컬럼_데이터 확인</p>
<pre><code>raw_data[&#39;summary&#39;].values</code></pre><p>보기 좋게 컬럼명 변경</p>
<pre><code># 필요한 컬럼들 : 날짜, 이슈, 국가, 사망자수, 부상자수, 지역구분, 공격형태(테러양상)

terr_df = raw_data.copy()

terr_df.rename(columns={
    &#39;eventid&#39;:&#39;eventid&#39;, &#39;iyear&#39;:&#39;Year&#39;,&#39;imonth&#39;:&#39;Month&#39;,&#39;iday&#39;:&quot;day&quot;,
    &#39;country_txt&#39;:&#39;Country&#39;,&#39;region_txt&#39;:&#39;Region&#39;,&#39;provstate&#39;:&#39;State&#39;,&#39;city&#39;:&#39;City&#39;,
    &#39;latitude&#39;:&#39;lat&#39;,  &#39;longitude&#39;:&#39;lng&#39;,
    &#39;targtype1_txt&#39;:&#39;Targettype&#39;,&#39;attacktype1_txt&#39;:&#39;Attacktype&#39;,&#39;weaptype1_txt&#39;:&#39;Weapon&#39;,
    &#39;nkill&#39;:&#39;Kill&#39;,&#39;nwound&#39;:&#39;Wound&#39;,
    &#39;gname&#39;:&#39;Group&#39;,&#39;summary&#39;:&#39;Summary&#39;,&#39;motive&#39;:&#39;Motive&#39;,
}, inplace=True)

terr_df.reset_index()
terr_df.tail(2)</code></pre><p>사용할 컬럼으로 변경</p>
<pre><code>terr_df = terr_df[[
    &#39;eventid&#39;, &#39;Year&#39;, &#39;Month&#39;, &#39;day&#39;, 
    &#39;Country&#39;, &#39;Region&#39;, &#39;State&#39;, &#39;City&#39;, 
    &#39;lat&#39;, &#39;lng&#39;,
    &#39;Targettype&#39;, &#39;Attacktype&#39;, &#39;Weapon&#39;, 
    &#39;Kill&#39;, &#39;Wound&#39;, &#39;Group&#39;, &#39;Summary&#39;, &#39;Motive&#39;
]]</code></pre><p>비어 있는 데이터 확인 :  isnull().sum()</p>
<pre><code>terr_df.isnull().sum()</code></pre><p>데이터 타입 확인</p>
<pre><code>terr_df.info()</code></pre><p></br></br></br></p>
<h1 id="분석-시작">분석 시작!</h1>
<hr>
<pre><code># 연도 컬럼에 몇해연도가 있는지 확인
year = terr_df[&#39;Year&#39;].unique()
year</code></pre><pre><code># 그래프(시각화) 전에 각 연도별 데이터 수 확인
year_count = terr_df[&#39;Year&#39;].value_counts(dropna=False).sort_index()
year_count[:4]</code></pre><p>연간 테러 발생 건수 </p>
<pre><code>import matplotlib.pyplot as plt
import seaborn as sns

year = terr_df[&#39;Year&#39;].unique()
year_count = terr_df[&#39;Year&#39;].value_counts(dropna=False).sort_index()


plt.figure(figsize=(12, 4))
ax = sns.barplot(x=year, y=year_count, palette=&#39;YlOrBr&#39;)
for p in ax.patches:
    ax.annotate(f&#39;{p.get_height()}&#39;, (p.get_x() + p.get_width() / 2., p.get_height()),
                ha=&#39;center&#39;, va=&#39;baseline&#39;, fontsize=10, color=&#39;black&#39;, xytext=(0, 5),
                textcoords=&#39;offset points&#39;,rotation = 90)
plt.xlabel(&#39;Attack Year&#39;)
plt.xticks(rotation=50, fontsize=7)
plt.ylabel(&#39;Number of attacks cases&#39;)
plt.title(&#39;Attacks In Years&#39;, fontsize=15)
plt.show()

# 1번 방법
# # 어떤 데이터를?
# year = terr_df[&#39;Year&#39;].unique()
# year_count = terr_df[&#39;Year&#39;].value_counts(dropna=False).sort_index()

# # 그래프 (쿼리 순서 중요)
# plt.figure(figsize=(12, 4))
# sns.barplot(x=year, y=year_count, palette=&#39;YlOrBr&#39;)
# plt.xlabel(&#39;Attack Year&#39;)
# plt.xticks(rotation=50, fontsize=7)
# plt.ylabel(&#39;Number of attacks cases&#39;)
# plt.title(&#39;Attacks In Years&#39;, fontsize=15)
# plt.show()


# 2번 방법
# # sns.countplot 사용
# plt.figure(figsize = (15,5))
# sns.countplot(x=&#39;Year&#39;,data=terr_df)
# plt.xticks(rotation=90)
# plt.xlabel(&#39;year&#39;, fontsize=10)
# plt.ylabel(&#39;counts&#39;, fontsize=10)
# plt.title(&#39;Number of terrorist activites each year&#39;, fontsize=15)
# plt.show()

# 3번 방법
# sns.countplot + counts text 추가
# plt.figure(figsize = (15,5))
# ax = sns.countplot(x=&#39;Year&#39;,data=terr_df)
# plt.xticks(rotation=90)
# # Adding annotations to the chart
# for p in ax.patches:
#     ax.annotate(f&#39;{p.get_height()}&#39;, (p.get_x() + p.get_width() / 2., p.get_height()),
#                 ha=&#39;center&#39;, va=&#39;baseline&#39;, fontsize=10, color=&#39;black&#39;, xytext=(0, 5),
#                 textcoords=&#39;offset points&#39;,rotation = 90)

# plt.title(&#39;Attacks In Years&#39;)
# plt.show</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/ddd3e361-95fe-4501-b404-211cfb9cfc8a/image.png" alt=""></p>
<p>테러발생 상위 10개국</p>
<pre><code>terr_rank_10 = terr_df[&#39;Country&#39;].value_counts()[:10]
terr_rank_10</code></pre><pre><code>terr_counts = terr_df.Country.value_counts()[:10].unique()
terr_counts</code></pre><pre><code>terr_rank = terr_df[&#39;Country&#39;].value_counts()[:10].index
terr_rank</code></pre><pre><code>plt.figure(figsize=(12, 4))
top_10_country = terr_df[&#39;Country&#39;].value_counts().head(10)

# Remove &#39;Unknown&#39;
# top_10_cities = top_10_cities[top_10_cities.index != &#39;Unknown&#39;]

sns.barplot(x=top_10_country.index, y=top_10_country.values, palette=&#39;rocket&#39;)
plt.title(&#39;Most attacks Country Top10&#39;)
plt.xlabel(&#39;Country&#39;)
plt.ylabel(&#39;Counts&#39;)
plt.xticks(rotation=30)

plt.show()

# 또 다른 방법
# terr_rank = terr_df[&#39;Country&#39;].value_counts()[:10].index
# terr_counts = terr_df.Country.value_counts()[:10].unique()

# plt.figure(figsize=(12, 4))
# sns.barplot(x=terr_rank, y=terr_counts, palette=&#39;YlOrBr_r&#39;)

# plt.xlabel(&#39;Countries&#39;)
# plt.xticks(fontsize=7)
# plt.ylabel(&#39;Count&#39;)
# plt.title(&#39;Most attacks Country Top10&#39;)
# plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/0abe38a9-0031-4a0d-8962-3d90251a88e2/image.png" alt=""></p>
<pre><code>fig,ax = plt.subplots(figsize=(12,4))
ax = sns.barplot(x=terr_df.Country.value_counts()[:10].values,y = terr_df.Country.value_counts()[:10].index, palette=&#39;RdYlGn&#39;)
ax.set_title(&#39;Most attacks Country Top10&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/64547440-fc31-4f80-a617-48b39d8b4c3b/image.png" alt=""></p>
<pre><code># columns [&#39;Country&#39;,&#39;Terror_cases&#39;]으로 상위 10개국 DataFrame 만들기
terr_rank_10 = terr_df.groupby(&#39;Country&#39;).size().to_frame(name=&#39;Terror_cases&#39;)
terr_rank_10.sort_values(&#39;Terror_cases&#39;, ascending=False, inplace=True)
terr_rank_10 = terr_rank_10.head(10).reset_index()
terr_rank_10</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/623dc100-3f0e-45dd-b03e-c9be868a9fb6/image.png" alt=""></p>
<p>사상자 상위 10개국</p>
<pre><code>coun_terror=terr_df[&#39;Country&#39;].value_counts()[:10].to_frame()
coun_terror.columns=[&#39;Wound&#39;]
coun_kill=terr_df.groupby(&#39;Country&#39;)[&#39;Kill&#39;].sum().to_frame()
coun_terror.merge(coun_kill,left_index=True,right_index=True,how=&#39;left&#39;).plot.bar()
fig=plt.gcf()
fig.set_size_inches(12,4)
plt.xticks(rotation=0)
plt.title(&#39;Wound &amp; Kill County Top10 &#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/fdf0434d-bd36-40c5-ac82-0e90de551eab/image.png" alt=""></p>
<p>지역별 테러 특성 확인</p>
<pre><code>resion_counts = terr_df[&#39;Region&#39;].value_counts()

fig = plt.figure(figsize=(12,8))

plt.pie(
    resion_counts,
    labels=None,
    autopct=&#39;%.1f%%&#39;,
    startangle=90,
    textprops={&#39;fontsize&#39;:10}
)

centre_circle = plt.Circle((0,0), 0.4, fc=&#39;white&#39;)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis(&#39;equal&#39;)
plt.title(&#39;Terrorist attack by Region&#39;)
plt.legend(resion_counts.index, loc=&#39;center left&#39;, bbox_to_anchor=(1, 0.5))
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/1058ff6c-1e6c-4c16-ab17-4a791834d3c9/image.png" alt=""></p>
<pre><code>#  df.loc[ &#39;행이름&#39;:&#39;행이름&#39;, &#39;열이름&#39;: &#39;열이름&#39;]
#  df.iloc[ 행번호:행번호, 열번호:열번호]
#  .unstack(fill_value=0) : 데이터 프레임으로 만들고 측정 결측치를 0 값으로 지정
region_year_counts = terr_df.groupby([&#39;Region&#39;, &#39;Year&#39;]).size().unstack(fill_value=0)</code></pre><pre><code>region_year_counts.index</code></pre><pre><code>region_year_counts.columns</code></pre><pre><code>pd.crosstab(terr_df.Year,terr_df.Region).plot(figsize=(12,4))
plt.title(&#39;Terrorist Attack By Region&#39;,size=10)
plt.ylabel(&#39;counts&#39;)

# region_year_counts = terr_df.groupby([&#39;Region&#39;, &#39;Year&#39;]).size().unstack(fill_value=0)

# plt.figure(figsize=(12, 4))

# for region in region_year_counts.index:
#     plt.plot(region_year_counts.columns, region_year_counts.loc[region], label=region)

# plt.title(&#39;Terrorist attack by Region&#39;)
# plt.xlabel(&#39;Year&#39;)
# plt.ylabel(&#39;counts&#39;)
# plt.legend(loc=&#39;upper left&#39;)
# plt.grid(True)

# plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/55866441-13c4-4780-b9e4-29cfe523d87d/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(12,4))
sns.countplot(x = terr_df[&#39;Region&#39;], order = terr_df[&#39;Region&#39;].value_counts().index)
plt.xticks(rotation=30, fontsize=8)
plt.xlabel(&#39;region&#39;)
plt.title(&#39;counts&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/b63bfc7e-ed18-4602-94bd-7203aee2c948/image.png" alt=""></p>
<pre><code>pd.crosstab(terr_df.Region,terr_df.Attacktype).plot.barh(stacked=True)
fig=plt.gcf()
fig.set_size_inches(12,7)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/ebe3e141-cf5a-456f-b38f-74b979febff0/image.png" alt=""></p>
<pre><code>coun_terror=terr_df[&#39;Region&#39;].value_counts().to_frame()
coun_terror.columns=[&#39;Wound&#39;]
coun_kill=terr_df.groupby(&#39;Region&#39;)[&#39;Kill&#39;].sum().to_frame()
coun_terror.merge(coun_kill,left_index=True,right_index=True,how=&#39;left&#39;).plot.bar()
fig=plt.gcf()
fig.set_size_inches(18,6)
plt.xticks(rotation=30, fontsize=8)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/a706599c-a190-418c-a2b5-2cca2ad7bde0/image.png" alt=""></p>
<p>연도별 테러 양상 분석</p>
<pre><code>import folium
import pandas as pd
import json
from folium.plugins import MarkerCluster 

terr_topYear = terr_df[&#39;Year&#39;] == 2014
filterData = terr_df[terr_topYear] # filter data

# filterData.info()
filterData_info = filterData.loc[:,&#39;City&#39;:&#39;lng&#39;] #We are getting the required fields
filterData_info = filterData_info.dropna() # drop NaN values in latitude and longitude
filterData_list = filterData_info.values.tolist()

# reqFilterDataList
map = folium.Map(location = [0, 30], tiles=&#39;CartoDB positron&#39;, zoom_start=2)

# clustered marker
markerCluster = folium.plugins.MarkerCluster().add_to(map)
for point in range(0, len(filterData_list)):
    folium.Marker(location=[filterData_list[point][1],filterData_list[point][2]],
                  popup = filterData_list[point][0]).add_to(markerCluster)
map</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/e66df52d-471b-44dd-855e-2c4c1c3d97f7/image.png" alt=""></p>
<pre><code>coun_terror=terr_topRate[&#39;Region&#39;].value_counts().to_frame()
coun_terror.columns=[&#39;Wound&#39;]
coun_kill=terr_topRate.groupby(&#39;Region&#39;)[&#39;Kill&#39;].sum().to_frame()
coun_terror.merge(coun_kill,left_index=True,right_index=True,how=&#39;left&#39;).plot.bar()
fig=plt.gcf()
fig.set_size_inches(18,6)
plt.xticks(rotation=30, fontsize=8)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/f95928c9-0ce5-4550-ac65-6ca23910f78a/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(12, 4))
terr_topRate = terr_topRate[&#39;Country&#39;].value_counts().head(10)

# Remove &#39;Unknown&#39;
# top_10_cities = top_10_cities[top_10_cities.index != &#39;Unknown&#39;]

sns.barplot(x=terr_topRate.index, y=terr_topRate.values, palette=&#39;rocket&#39;)
plt.title(&#39;Most attacks Country Top10&#39;)
plt.xlabel(&#39;Country&#39;)
plt.ylabel(&#39;Counts&#39;)
plt.xticks(rotation=30)

plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/cea76560-9a50-45e0-9276-a6986e014c8a/image.png" alt=""></p>
<pre><code>terr_topRate = terr_df.copy()
terr_topRate = terr_topRate.loc[(terr_topRate[&#39;Year&#39;]==2012)|(terr_topRate[&#39;Year&#39;]==2013)|(terr_topRate[&#39;Year&#39;]==2014)]

plt.figure(figsize=(12, 4))
top_5_weapon_types = terr_topRate[&#39;Weapon&#39;].value_counts().head()

sns.barplot(x=top_5_weapon_types.index, y=top_5_weapon_types.values, palette=&#39;flare&#39;)
plt.title(&#39;Top 5 Most Used Weapon Types&#39;, fontsize=10)
plt.xlabel(&#39;Weapon Types&#39;)
plt.ylabel(&#39;counts&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/738a0a6a-6060-4ef8-a3a5-7035eb311324/image.png" alt=""></p>
<pre><code>terr_topRate = terr_df.copy()
terr_topRate = terr_topRate.loc[(terr_topRate[&#39;Year&#39;]==2012)|(terr_topRate[&#39;Year&#39;]==2013)|(terr_topRate[&#39;Year&#39;]==2014)]

coun_terror=terr_topRate[&#39;Country&#39;].value_counts()[:10].to_frame()
coun_terror.columns=[&#39;Attacks&#39;]
coun_kill=terr_topRate.groupby(&#39;Country&#39;)[&#39;Kill&#39;].sum().to_frame()
coun_terror.merge(coun_kill,left_index=True,right_index=True,how=&#39;left&#39;).plot.bar()
fig=plt.gcf()
fig.set_size_inches(12,4)
plt.title(&#39;Attacks &amp; Killed (2012-2014)&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/7296e15b-e4c4-4774-b1db-70171fd50567/image.png" alt=""></p>
<pre><code>fig,ax = plt.subplots(figsize=(12,4))
# Unkown 삭제를 위해 _counts()[1:10]
ax = sns.barplot(x=terr_topRate.Group.value_counts()[1:10].values,y = terr_topRate.Group.value_counts()[1:10].index, palette=&#39;mako&#39;)
ax.set_title(&#39;Terrorist Groups with Highest Terror Attacks&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/030726de-2b48-4aba-a16c-fb594f8cd054/image.png" alt=""></p>
<pre><code>df_Iraq = terr_topRate[terr_topRate[&#39;Country&#39;] == &#39;Iraq&#39;]

fig,ax = plt.subplots(figsize=(12,4))
ax = sns.barplot(x=df_Iraq.Group.value_counts()[1:6].values,y = df_Iraq.Group.value_counts()[1:6].index, palette=&#39;Blues&#39;)
ax.set_title(&#39;Terrorist Groups, Iraq (2012-2014)&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/f09190ef-a6df-4316-81fd-fc04644a508a/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(4, 2))
sns.barplot(x=df_Iraq_Bag[&#39;Year&#39;].value_counts().index, y=df_Iraq_Bag[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks by ISIL, Iraq (2012-2014)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/476d602b-9ccb-4d86-b2ad-a3f52244bba6/image.png" alt=""></p>
<pre><code>df_Iraq[&#39;City&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;lightblue&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in Iraq(2012-2014)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/38494f2c-deee-4383-b5e0-816fed0f1f33/image.png" alt=""></p>
<pre><code>df_Iraq[&#39;Attacktype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;steelblue&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Attacktype&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 5 Attacktype, Iraq (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/359d025f-87a1-4cf3-aeef-06bf106c9bc7/image.png" alt=""></p>
<pre><code>df_Iraq[&#39;Targettype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;steelblue&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Targettype&quot;)
plt.ylabel(&quot;count&quot;)
plt.title(&quot;Top 5 Targettype, Iraq (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/39c48bb4-d342-490c-b78d-da9255e0de53/image.png" alt=""></p>
<pre><code>df_Af = terr_topRate[terr_topRate[&#39;Country&#39;] == &#39;Afghanistan&#39;]

fig,ax = plt.subplots(figsize=(12,4))
ax = sns.barplot(x=df_Af.Group.value_counts()[:5].values,y = df_Af.Group.value_counts()[:5].index, palette=&#39;crest&#39;)
ax.set_title(&#39;Terrorist Groups, Afghanistan (2012-2014)&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/7a7d5af8-647c-4e88-ab72-638c1726dc4d/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(4, 2))
sns.barplot(x=df_Af_Ta[&#39;Year&#39;].value_counts().index, y=df_Af_Ta[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks by Taliban, Afghanistan (2012-2014)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/139cf8fb-d30a-4795-9646-aa85f8bedb3d/image.png" alt=""></p>
<pre><code>df_Af[&#39;City&#39;].value_counts()[1:11].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;seagreen&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in Afghanistan(2012-2014)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/7acb7d4f-8e14-4020-9a46-592ffa0ec2bd/image.png" alt=""></p>
<pre><code>df_Af[&#39;Attacktype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;cadetblue&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Attacktype&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 5 Attacktype, Afghanistan (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/d6206521-249c-4767-89e4-afb404d8dab9/image.png" alt=""></p>
<pre><code>df_Af[&#39;Targettype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;cadetblue&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Targettype&quot;)
plt.ylabel(&quot;count&quot;)
plt.title(&quot;Top 5 Targettype, Afghanistan (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/710dd076-05c3-441c-8f05-5e2058bf285a/image.png" alt=""></p>
<pre><code>df_Pakistan = terr_topRate[terr_topRate[&#39;Country&#39;] == &#39;Pakistan&#39;]

fig,ax = plt.subplots(figsize=(12,4))
ax = sns.barplot(x=df_Pakistan.Group.value_counts()[1:6].values,y = df_Pakistan.Group.value_counts()[1:6].index, palette=&#39;YlOrBr&#39;)
ax.set_title(&#39;Terrorist Groups, Pakistan (2012-2014)&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/b3a479b4-1352-4a4c-ae87-24fb3163ceb6/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(4, 2))
sns.barplot(x=df_Paki_TTP[&#39;Year&#39;].value_counts().index, y=df_Paki_TTP[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks by TTP, Pakistan (2012-2014)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/2e032ba6-f7c7-4412-bba4-62af027aaeb2/image.png" alt=""></p>
<pre><code>df_Pakistan[&#39;City&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;khaki&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in Pakistan (2012-2014)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/a147a76a-34c8-4c0f-a56b-2656c8ef2540/image.png" alt=""></p>
<pre><code>df_Pakistan[&#39;Attacktype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;darkkhaki&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Attacktype&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 5 Attacktype, Pakistan (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/66b77cf3-69f6-4dcb-9282-1fbf43b72d59/image.png" alt=""></p>
<pre><code>df_Pakistan[&#39;Targettype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;darkkhaki&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Targettype&quot;)
plt.ylabel(&quot;count&quot;)
plt.title(&quot;Top 5 Targettype, Pakistan (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/0794b519-1132-4f77-b707-fff752767bfa/image.png" alt=""></p>
<pre><code>df_Nigeria = terr_topRate[terr_topRate[&#39;Country&#39;] == &#39;Nigeria&#39;]

fig,ax = plt.subplots(figsize=(12,4))
ax = sns.barplot(x=df_Nigeria.Group.value_counts()[:5].values,y = df_Nigeria.Group.value_counts()[:5].index, palette=&#39;ch:start=.2,rot=-.3&#39;)
ax.set_title(&#39;Terrorist Groups, Nigeria (2012-2014)&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/71cc62a7-d339-4871-9217-e40742c50e0d/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(4, 2))
sns.barplot(x=df_Ni_Bo[&#39;Year&#39;].value_counts().index, y=df_Ni_Bo[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks by Boko Haram, Nigeria (2012-2014)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/d6dadbcf-2aae-4413-8d98-e96088e3b1e9/image.png" alt=""></p>
<pre><code>df_Nigeria[&#39;City&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;lightslategray&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in Nigeria (2012-2014)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/b4c4d6c7-bb48-4062-a1cc-a3cc231ff35c/image.png" alt=""></p>
<pre><code>df_Nigeria[&#39;Attacktype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;darkgrey&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Attacktype&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 5 Attacktype, Nigeria (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/15256451-4a22-40b8-a547-133c27316409/image.png" alt=""></p>
<pre><code>df_Nigeria[&#39;Targettype&#39;].value_counts()[:5].plot(kind=&#39;bar&#39;,figsize=(12, 4),color=&#39;darkgrey&#39;)
plt.xticks(rotation=0, fontsize=8)
plt.xlabel(&quot;Targettype&quot;)
plt.ylabel(&quot;count&quot;)
plt.title(&quot;Top 5 Targettype, Nigeria (2012-2014)&quot;,fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/d16550a1-2d4b-478a-a4ad-818468a912d7/image.png" alt=""></p>
<p>연도별(10년 단위) 특성 분석</p>
<pre><code>terr_df[&#39;Year&#39;].unique()</code></pre><pre><code>df_70s = terr_df.loc[(terr_df[&#39;Year&#39;]==1970) | (terr_df[&#39;Year&#39;]==1971) | (terr_df[&#39;Year&#39;]==1972) | (terr_df[&#39;Year&#39;]==1973) | (terr_df[&#39;Year&#39;]==1974) | (terr_df[&#39;Year&#39;]==1975) | (terr_df[&#39;Year&#39;]==1976) | (terr_df[&#39;Year&#39;]==1977) | (terr_df[&#39;Year&#39;]==1978) | (terr_df[&#39;Year&#39;]==1979)]
df_80s = terr_df.loc[(terr_df[&#39;Year&#39;]==1980) | (terr_df[&#39;Year&#39;]==1981) | (terr_df[&#39;Year&#39;]==1982) | (terr_df[&#39;Year&#39;]==1983) | (terr_df[&#39;Year&#39;]==1984) | (terr_df[&#39;Year&#39;]==1985) | (terr_df[&#39;Year&#39;]==1986) | (terr_df[&#39;Year&#39;]==1987) | (terr_df[&#39;Year&#39;]==1988) | (terr_df[&#39;Year&#39;]==1989)]
df_90s = terr_df.loc[(terr_df[&#39;Year&#39;]==1990) | (terr_df[&#39;Year&#39;]==1991) | (terr_df[&#39;Year&#39;]==1992) | (terr_df[&#39;Year&#39;]==1993) | (terr_df[&#39;Year&#39;]==1994) | (terr_df[&#39;Year&#39;]==1995) | (terr_df[&#39;Year&#39;]==1996) | (terr_df[&#39;Year&#39;]==1997) | (terr_df[&#39;Year&#39;]==1998) | (terr_df[&#39;Year&#39;]==1999)]
df_00s = terr_df.loc[(terr_df[&#39;Year&#39;]==2000) | (terr_df[&#39;Year&#39;]==2001) | (terr_df[&#39;Year&#39;]==2002) | (terr_df[&#39;Year&#39;]==2003) | (terr_df[&#39;Year&#39;]==2004) | (terr_df[&#39;Year&#39;]==2005) | (terr_df[&#39;Year&#39;]==2006) | (terr_df[&#39;Year&#39;]==2007) | (terr_df[&#39;Year&#39;]==2008) | (terr_df[&#39;Year&#39;]==2009)]
df_10s = terr_df.loc[(terr_df[&#39;Year&#39;]==2010) | (terr_df[&#39;Year&#39;]==2011) | (terr_df[&#39;Year&#39;]==2012) | (terr_df[&#39;Year&#39;]==2013) | (terr_df[&#39;Year&#39;]==2014) | (terr_df[&#39;Year&#39;]==2015) | (terr_df[&#39;Year&#39;]==2016) | (terr_df[&#39;Year&#39;]==2017)]</code></pre><pre><code>pd.crosstab(df_70s.Year,df_70s.Region).plot(figsize=(12,4))
# plt.title(&#39;Terrorist Attack By Region&#39;)
plt.title(&#39;Terrorism By Region&#39;)
plt.ylabel(&#39;counts&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/8414b45a-6000-4080-ac2c-2b538c5f7c7e/image.png" alt=""></p>
<p>70년대</p>
<pre><code>plt.figure(figsize=(12,4))
sns.barplot(x = df_70s[&#39;Region&#39;].value_counts().values[:10], y = df_70s[&#39;Region&#39;].value_counts()[:10].index,palette = &#39;autumn&#39;)
plt.xlabel(&#39;Number of Attacks&#39;)
plt.ylabel(&#39;Region&#39;)
plt.title(&#39;Top 10 Attacks in 70s&#39;,size=15)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/f05b9123-4834-43a6-b6cb-4db7a31c16b1/image.png" alt=""></p>
<pre><code>pd.crosstab(df_70s.Region,df_70s.Attacktype).plot.barh(stacked=True)
fig=plt.gcf()
fig.set_size_inches(12,7)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/3d3673b9-1a91-4f45-a253-81a0364d2ae4/image.png" alt=""></p>
<pre><code>pd.crosstab(df_70s.Region,df_70s.Weapon).plot.barh(stacked=True)
fig=plt.gcf()
fig.set_size_inches(12,7)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/e1a9e51f-f604-45a9-9923-4c43318b3e34/image.png" alt=""></p>
<pre><code>pd.crosstab(df_70s.Region,df_70s.Targettype).plot.barh(stacked=True)
fig=plt.gcf()
fig.set_size_inches(12,7)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/67b06ae9-4986-499b-a982-5cbca579dd77/image.png" alt=""></p>
<pre><code>attack_data = df_70s.groupby(&#39;Region&#39;)[[&#39;Kill&#39;, &#39;Wound&#39;]].sum()
attack_data.plot(kind=&#39;bar&#39;, stacked=True,figsize = (12,4))
plt.xlabel(&#39;Region&#39;)
plt.ylabel(&#39;Count&#39;)
plt.title(&#39;Kill &amp; Wound in 70s&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/3eefab82-b1a0-4768-9b6f-d8cee3651037/image.png" alt=""></p>
<p>80년대</p>
<pre><code>plt.figure(figsize=(12,4))
sns.barplot(x = df_80s[&#39;Region&#39;].value_counts().values[:10], y = df_80s[&#39;Region&#39;].value_counts()[:10].index,palette = &#39;autumn&#39;)
plt.xlabel(&#39;Number of Attacks&#39;)
plt.ylabel(&#39;Region&#39;)
plt.title(&#39;Top 10 Attacks in 80s&#39;,size=15)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/dc5e7205-8f63-4a26-b628-4a1e1e3bcbca/image.png" alt=""></p>
<pre><code>pd.crosstab(df_80s.Region,df_80s.Attacktype).plot.barh(stacked=True)
fig=plt.gcf()
fig.set_size_inches(12,7)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/8e428126-3050-4636-a93d-20232e17f4b7/image.png" alt=""></p>
<pre><code>pd.crosstab(df_80s.Region,df_80s.Weapon).plot.barh(stacked=True)
fig=plt.gcf()
fig.set_size_inches(12,7)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/99a387e0-00ba-446b-976b-1143b7873d8f/image.png" alt=""></p>
<pre><code>df_80s[&#39;Weapon&#39;].value_counts()[:5].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;lightslategray&#39;)
plt.xlabel(&quot;Weapon&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 5 Weapons in 80s&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/2e986f15-6b40-4456-9892-d49facb38bca/image.png" alt=""></p>
<pre><code>df_80s[&#39;Targettype&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;lightslategray&#39;)
plt.xlabel(&quot;Targettype&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 Targettype in 80s&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/c70a1623-e550-48cc-b75f-14574351634d/image.png" alt=""></p>
<p>90년대</p>
<pre><code>plt.figure(figsize=(12,4))
sns.barplot(x = df_90s[&#39;Region&#39;].value_counts().values[:10], y = df_90s[&#39;Region&#39;].value_counts()[:10].index,palette = &#39;autumn&#39;)
plt.xlabel(&#39;Number of Attacks&#39;)
plt.ylabel(&#39;Region&#39;)
plt.title(&#39;Top 10 Attacks in 90s&#39;,size=15)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/979d3f2b-3acd-4423-8418-7ddfff40bb3c/image.png" alt=""></p>
<pre><code>pd.crosstab(df_90s.Region,df_90s.Attacktype).plot.barh(stacked=True)
fig.set_size_inches(12,4)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/98a1c23a-a3c5-4973-8a83-49cb3ace40dc/image.png" alt=""></p>
<pre><code>df_90s[&#39;Attacktype&#39;].value_counts()[:5].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;lightslategray&#39;)
plt.xlabel(&quot;Weapon&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 5 Attacktype in 90s&quot;, fontsize=15)
plt.xticks(rotation=0)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/7440e07b-0aaf-4464-9e90-c6bc8f6881d2/image.png" alt=""></p>
<pre><code>df_90s[&#39;Targettype&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;lightslategray&#39;)
plt.xlabel(&quot;Targettype&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 Targettype in 90s&quot;, fontsize=15)
plt.xticks(rotation=30)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/033f1843-6fe7-432a-9481-7808c17f7d73/image.png" alt=""></p>
<p>2000년대</p>
<pre><code>plt.figure(figsize=(12,4))
sns.barplot(x = df_00s[&#39;Region&#39;].value_counts().values[:10], y = df_00s[&#39;Region&#39;].value_counts()[:10].index,palette = &#39;autumn&#39;)
plt.xlabel(&#39;Number of Attacks&#39;)
plt.ylabel(&#39;Region&#39;)
plt.title(&#39;Top 10 Attacks in 2000&#39;,size=15)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/f0236f6a-fbb5-4066-9e2b-cf69a1c743b9/image.png" alt=""></p>
<pre><code>attack_data = df_00s.groupby(&#39;Region&#39;)[[&#39;Kill&#39;, &#39;Wound&#39;]].sum()
attack_data.plot(kind=&#39;bar&#39;, stacked=True,figsize = (12,4))
plt.xlabel(&#39;Region&#39;)
plt.ylabel(&#39;Count&#39;)
plt.title(&#39;Kill &amp; Wound in 2000s&#39;)
plt.xticks(rotation=30)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/5323e68a-7e3e-4041-a6d7-7e73ca0090b9/image.png" alt=""></p>
<pre><code>df_2000_MN = df_00s[df_00s[&#39;Region&#39;] == &#39;Middle East &amp; North Africa&#39;]

plt.figure(figsize=(12, 4))
sns.barplot(x=df_2000_MN[&#39;Year&#39;].value_counts().index, y=df_2000_MN[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks in Middle East &amp; North Africa (2000s)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/a89348ff-076e-48b7-8fbe-74ca143ad46c/image.png" alt=""></p>
<pre><code>a = df_2000_MN[df_2000_MN[&#39;Year&#39;] == 2008]
a[&#39;City&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;cadetblue&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in Middle East &amp; North Africa (2008y)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/eccdf9c2-454e-4487-9417-92928aa351ab/image.png" alt=""></p>
<pre><code>a = df_00s[(df_00s[&#39;City&#39;] == &#39;Baghdad&#39;) | (df_00s[&#39;Year&#39;] == 2008)]
plt.figure(figsize=(12, 4))
sns.barplot(x=a[&#39;Targettype&#39;].value_counts()[:5].index, y=a[&#39;Targettype&#39;].value_counts()[:5].values, palette=&#39;viridis&#39;)
plt.title(&#39;Targettype in Baghdad(Middle East &amp; North Africa, 2008y)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/ff341bbd-25bc-43cd-9208-832c12f2593a/image.png" alt=""></p>
<pre><code>df_2000_SA = df_00s[df_00s[&#39;Region&#39;] == &#39;South Asia&#39;]

plt.figure(figsize=(12, 4))
sns.barplot(x=df_2000_SA[&#39;Year&#39;].value_counts().index, y=df_2000_SA[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks in South Asia (2000s)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/63f21099-a052-4ae6-b2e3-8e673a1c3c3d/image.png" alt=""></p>
<pre><code>b = df_2000_SA[df_2000_SA[&#39;Year&#39;] == 2009]
b[&#39;City&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;cadetblue&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in South Asia (2009y)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/5f98cc0d-de75-40ad-9d74-25cb5be0a5ef/image.png" alt=""></p>
<pre><code>b = df_00s[(df_00s[&#39;City&#39;] == &#39;Quetta&#39;) | (df_00s[&#39;Year&#39;] == 2009)]
plt.figure(figsize=(12, 4))
sns.barplot(x=b[&#39;Targettype&#39;].value_counts()[:5].index, y=b[&#39;Targettype&#39;].value_counts()[:5].values, palette=&#39;viridis&#39;)
plt.title(&#39;Targettype in Quetta(South Asia, 2009y)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/190cdfef-0597-4485-8a73-c3411187cc3b/image.png" alt=""></p>
<pre><code>b = df_00s[(df_00s[&#39;City&#39;] == &#39;Peshawar&#39;) | (df_00s[&#39;Year&#39;] == 2009)]
plt.figure(figsize=(12, 4))
sns.barplot(x=b[&#39;Targettype&#39;].value_counts()[:5].index, y=b[&#39;Targettype&#39;].value_counts()[:5].values, palette=&#39;YlOrBr&#39;)
plt.title(&#39;Targettype in Peshawar(South Asia, 2009y)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/009b3a71-ef70-4596-8f01-750c1c0aec0b/image.png" alt=""></p>
<p>2010년</p>
<pre><code>plt.figure(figsize=(12,4))
sns.barplot(x = df_10s[&#39;Region&#39;].value_counts().values[:10], y = df_10s[&#39;Region&#39;].value_counts()[:10].index,palette = &#39;autumn&#39;)
plt.xlabel(&#39;Number of Attacks&#39;)
plt.ylabel(&#39;Region&#39;)
plt.title(&#39;Top 10 Attacks in 2010s&#39;,size=15)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/50d76f37-3df6-4982-8417-c464e271cddc/image.png" alt=""></p>
<pre><code>attack_data = df_10s.groupby(&#39;Region&#39;)[[&#39;Kill&#39;, &#39;Wound&#39;]].sum()
attack_data.plot(kind=&#39;bar&#39;, stacked=True,figsize = (12,4))
plt.xlabel(&#39;Region&#39;)
plt.ylabel(&#39;Count&#39;)
plt.title(&#39;Kill &amp; Wound in 2010s&#39;)
plt.xticks(rotation=90)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/acac1221-120f-458b-8bf9-b9e76342e2fb/image.png" alt=""></p>
<pre><code>df_2010_MN = df_10s[df_10s[&#39;Region&#39;] == &#39;Middle East &amp; North Africa&#39;]

plt.figure(figsize=(12, 4))
sns.barplot(x=df_2010_MN[&#39;Year&#39;].value_counts().index, y=df_2010_MN[&#39;Year&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Terror Attacks in Middle East &amp; North Africa (2010s)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/d9b7746d-09d3-40d5-bc94-8846e5866ffa/image.png" alt=""></p>
<pre><code>c = df_2010_MN[df_2010_MN[&#39;Year&#39;] == 2014]
c[&#39;City&#39;].value_counts()[:10].to_frame().sort_values(&#39;count&#39;, ascending=False).plot(kind=&#39;bar&#39;,figsize=(12,4),color=&#39;cadetblue&#39;)
plt.xlabel(&quot;City&quot;)
plt.ylabel(&quot;Number of attack&quot;)
plt.title(&quot;Top 10 most effected city in Middle East &amp; North Africa (2014y)&quot;, fontsize=15)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/e92a947f-a858-4956-9315-1e6887aca022/image.png" alt=""></p>
<pre><code>d = df_10s[(df_10s[&#39;City&#39;] == &#39;Baghdad&#39;) | (df_10s[&#39;Year&#39;] == 2014)]
plt.figure(figsize=(12, 4))
sns.barplot(x=d[&#39;Targettype&#39;].value_counts()[:5].index, y=d[&#39;Targettype&#39;].value_counts()[:5].values, palette=&#39;viridis&#39;)
plt.title(&#39;Targettype in Baghdad(Middle East &amp; North Africa, 2014)&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/533d0e47-e193-4902-98d0-8c2d46c3b70c/image.png" alt=""></p>
<p>한국 집계</p>
<pre><code>#Preparing the data for analysis
Ko = terr_df[terr_df.Country == &#39;South Korea&#39;]
Ko_cities = Ko.groupby(by=&#39;City&#39;,as_index=False).count().sort_values(by=&#39;eventid&#39;,ascending=False).iloc[:5,]

Ko_kill_size = Ko[&#39;Kill&#39;].sum() / len(Ko)
labels = [&#39;Kill&#39;, &#39;Not Kill&#39;]

Ko_year = Ko.groupby(by=&#39;Year&#39;, as_index=False).sum().loc[:, [&#39;Year&#39;, &#39;Kill&#39;]]

Iraq_weapon = Ko.groupby(by=&#39;Weapon&#39;,as_index=False).count().sort_values(by=&#39;eventid&#39;,ascending=False).iloc[:,:2]
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Plot 1 - Top 5 terrorism cities    
sns.barplot(x=&#39;eventid&#39;, y=&#39;City&#39;, data=Ko_cities, ci=None, ax=axs[0, 0],palette=&#39;summer&#39;)
axs[0, 0].set_title(f&#39;Top 5 South Korea Cities With Most Terrorism Occurences&#39;)
axs[0, 0].set_ylabel(&#39;City&#39;)
axs[0, 0].set_xlabel(&#39;Victims&#39;)

# Plot 2 - Suicide Rate
center_circle = plt.Circle((0,0), 0.75, color=&#39;white&#39;)
axs[0, 1].pie((Ko_kill_size, 1-Ko_kill_size), labels=labels,colors=[&#39;crimson&#39;,&#39;green&#39;] , autopct=&#39;%1.1f%%&#39;)
axs[0, 1].add_artist(center_circle)
axs[0, 1].set_title(&#39;South Korea Terrorism kill Rate&#39;)
axs[0, 0].set_ylabel(&#39;Victims&#39;)

# Plot 3 - Victims through the years
sns.lineplot(x=&#39;Year&#39;, y=&#39;Kill&#39;, data=Ko_year, ax=axs[1, 0],color=&#39;crimson&#39;)
axs[1, 0].set_xlim([1970, 2017])
axs[1, 0].set_title(&#39;South Korea Number of Victims Over Time&#39;)
axs[1, 0].set_ylabel(&#39;Victims&#39;)

# Plot 4 - Terrorism Weapons
sns.barplot(x=&#39;Weapon&#39;, y=&#39;eventid&#39;, data=Iraq_weapon, ci=None, ax=axs[1, 1],palette=&#39;summer&#39;)
axs[1, 1].set_xticklabels(axs[1, 1].get_xticklabels(), rotation=90)
axs[1, 1].set_xlabel(&#39;&#39;)
axs[1, 1].set_ylabel(&#39;Count&#39;)
axs[1, 1].set_title(&#39;South Korea Weapons Used in Attacks&#39;)

plt.suptitle(&#39;Terrorism Analysis in South Korea between 1970 and 2017&#39;, size=16)    
plt.subplots_adjust(top=0.90)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/1de986ec-6e46-4128-9489-6c7613e666c4/image.png" alt=""></p>
<pre><code>pd.crosstab(Ko.Year,Ko.City).plot(figsize=(12,4))
# plt.title(&#39;Terrorist Attack By Region&#39;)
plt.title(&#39;Terrorism in Korea&#39;)
plt.ylabel(&#39;counts&#39;)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/1823476e-3733-4b88-af61-3dcbc0667dce/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(12, 4))
sns.barplot(x=Ko[&#39;City&#39;].value_counts().index, y=Ko[&#39;City&#39;].value_counts().values, palette=&#39;viridis&#39;)
plt.title(&#39;Attacks by Student Radicals, Korea&#39;)
plt.xlabel(&#39;Years&#39;)
plt.ylabel(&#39;Number of Attacks&#39;)
plt.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/9341feaf-6006-4a53-84b7-884a77d4e4bc/image.png" alt=""></p>
<pre><code>plt.figure(figsize=(12,4))
sns.barplot(x = Ko[&#39;City&#39;].value_counts().values, y = Ko[&#39;City&#39;].value_counts().index,palette = &#39;autumn&#39;)
plt.xlabel(&#39;Number of Attacks&#39;)
plt.ylabel(&#39;Region&#39;)
plt.title(&#39;Attacks in Korea&#39;,size=15)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/2fc4d170-ebfd-4546-9aba-4e034693d255/image.png" alt=""></p>
<p>제로베이스 데이터 스쿨</p>
]]></description>
        </item>
        <item>
            <title><![CDATA[SQL - mini test _ 세계 테러 분석]]></title>
            <link>https://velog.io/@jaam_mini/SQL-mini-test-%EC%84%B8%EA%B3%84-%ED%85%8C%EB%9F%AC-%EB%B6%84%EC%84%9D</link>
            <guid>https://velog.io/@jaam_mini/SQL-mini-test-%EC%84%B8%EA%B3%84-%ED%85%8C%EB%9F%AC-%EB%B6%84%EC%84%9D</guid>
            <pubDate>Sun, 28 Jan 2024 16:50:57 GMT</pubDate>
            <description><![CDATA[<h2 id="원본-data-download">원본 Data Download</h2>
<ul>
<li><a href="https://www.kaggle.com/datasets/START-UMD/gtd">https://www.kaggle.com/datasets/START-UMD/gtd</a></li>
</ul>
<blockquote>
<p>문제 1.  csv 파일에 저장된 세계 테러 데이터를 하나의 테이블에 저장하세요. </p>
</blockquote>
<ul>
<li>globalterrorismmdb_0718.csv - <a href="https://www.kaggle.com/datasets/START-UMD/gtd">https://www.kaggle.com/datasets/START-UMD/gtd</a></li>
<li>pandas의 to_sql 함수 사용 (힌트 : create_engine)</li>
<li>Table name : origin_terror_data </li>
</ul>
<pre><code># !pip install sqlalchemy
# !pip install pymysql
# !pip install SQLAlchemy Flask-SQLAlchemy</code></pre><pre><code># 실수로 oneday DB를 삭제해서...다시 생성(1)
import mysql.connector

conn = mysql.connector.connect(
    host = &quot;내꺼&quot;,
    port = 3306,
    user = &quot;admin&quot;,
    password = &quot;내꺼&quot;,
    database = &quot;zerobase&quot;
)

cursor = conn.cursor(buffered = True)
sql = &#39;create database oneday default character set utf8mb4&#39;
cursor.execute(sql)

cursor.execute(&quot;create user &#39;oneday&#39;@&#39;%&#39; identified by &#39;1234&#39;&quot;)
cursor.execute(&quot;grant all on oneday.* to &#39;oneday&#39;@&#39;%&#39;&quot;)

conn = mysql.connector.connect(
    host = &quot;내꺼&quot;,
    port = 3306,
    user = &quot;oneday&quot;,
    password = &quot;1234&quot;,
    database = &quot;oneday&quot;
)</code></pre><pre><code>import time
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import configparser

# 데이터 불러오기
df = pd.read_csv(&#39;./globalterrorismdb_0718dist.csv&#39;, encoding=&#39;ISO-8859-1&#39;)

# DB 접속 엔진 객체 생성
user = &#39;oneday&#39;
password = &#39;1234&#39;
host = &#39;내꺼&#39;
port = 3306
database = &#39;oneday&#39;

# Engine 객체 설정 (URL 문자열을 사용하여 데이터베이스 호스트 연결)
engine = create_engine(f&#39;mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4&#39;)</code></pre><pre><code># DB 테이블 명(생성될 테이블 이름)
table_name = &quot;origin_terror_data&quot;

# DB에 DataFrame 적재
df.to_sql(index = False, 
          name = table_name,
          con = engine,
          if_exists = &#39;append&#39;,
          method = &#39;multi&#39;, 
          chunksize = 10000)</code></pre><pre><code># (에러) &#39;Engine&#39; object has no attribute &#39;execute&#39;
# records = engine.execute(&quot;SELECT COUNT(*) FROM origin_terror_data&quot;).fetchall()
# print(records)

# 방법을 바꿈 (참고 : https://blog.naver.com/lechga/223290561410)
from sqlalchemy import text

conn = engine.connect()

with engine.connect() as conn:
    records = conn.execute(text(&quot;SELECT COUNT(*) FROM origin_terror_data&quot;)).fetchall()
    print(records)</code></pre><blockquote>
<p>문제 2. origin_terror_data 에서 region, country 관련 데이터는 code 와 txt (name) 속성으로 정의되어 있습니다. 
문제 2-1. Region 및 Country 테이블을 그림과 같은 구조로 생성하세요. </p>
</blockquote>
<ul>
<li>origin_terror_data 를 분석하여 각 테이블의 데이터 타입을 정의하세요. </li>
<li>문자열 데이터의 사이즈는 origin_terror_data 테이블에서 해당 데이터의 max length 를 쿼리로 체크하여 정의하세요. </li>
<li>Region 과 Country 데이터 사이의 관계를 파악하여 Foreign Key 를 설정하세요.
참고&gt; </li>
<li>Region.region_code     = origin_terror_data.region</li>
<li>Region.region_name     = origin_terror_data.region_txt</li>
<li>Country.country_code   = origin_terror_data.country</li>
<li>Country.country_name   = origin_terror_data.country_txt</li>
</ul>
<pre><code>##############################################################################################
#  문제 2. region / country / city 데이터 추출하여 데이터베이스로 변환하기 
#  중복을 제거한 code - name 값 
#  region - country - city 관계 정의 
##############################################################################################
import mysql.connector

conn = mysql.connector.connect(
    host = &#39;database-1.cj22sogoe8oa.ap-southeast-2.rds.amazonaws.com&#39;,
    port = 3306,
    user = &quot;oneday&quot;,
    password = &quot;1234&quot;,
    database = &quot;oneday&quot;
)
cursor = conn.cursor(buffered=True)</code></pre><pre><code># region_txt 의 max length 체크 
region_maxL = &#39;select max(char_length(region_txt)) from origin_terror_data&#39;

cursor.execute(region_maxL)
result = cursor.fetchall()
result</code></pre><pre><code># Region 테이블 만들기
region_table = (&quot;create table Region(region_code int not null auto_increment primary key, region_name varchar(32))&quot;)
cursor.execute(region_table)
conn.commit()</code></pre><pre><code>sql = (&#39;desc Region&#39;)
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># country_txt 의 max length 체크 

country_maxL = &quot;select max(char_length(country_txt)) from origin_terror_data&quot;

cursor.execute(country_maxL)
result = cursor.fetchall()
result</code></pre><pre><code># Country 테이블 만들기 
country_table = (&quot;create table Country (&quot;
        &quot;country_code int not null auto_increment primary key, &quot;
        &quot;region_code int, &quot;
        &quot;country_name varchar(32), &quot;
        &quot;foreign key (region_code) references Region(region_code)&quot;
        &quot;)&quot;
        )
cursor.execute(country_table)
conn.commit()</code></pre><pre><code># desc country
sql = (&#39;desc Country&#39;)
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><blockquote>
<p>문제 2-2. origin_terror_data 테이블에서 Region 및 Country 데이터를 추출하여 문제 2-1.에서 생성한 테이블에 입력하고 확인하세요.   </p>
</blockquote>
<ul>
<li>중복을 제거한 Unique Data 를 추출하세요. </li>
<li>데이터를 INSERT 할때 순서를 고민하세요. </li>
</ul>
<pre><code># Region 데이터 추출하기 

region_data = (&#39;SELECT DISTINCT region, region_txt FROM origin_terror_data ORDER BY region ASC&#39;)
cursor.execute(region_data)

result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># Region 데이터 추출하기 

region_data = (&#39;SELECT DISTINCT region, region_txt FROM origin_terror_data ORDER BY region ASC&#39;)
cursor.execute(region_data)

result = cursor.fetchall()
print(result)</code></pre><pre><code># Region 테이블에 INSERT 
region_insert = (&#39;INSERT INTO Region(region_code, region_name) SELECT DISTINCT region, region_txt FROM origin_terror_data&#39;)

cursor.execute(region_insert)
conn.commit()</code></pre><pre><code># Country 데이터 추출하기 
country_data = (&#39;SELECT DISTINCT country, country_txt FROM origin_terror_data&#39;)
cursor.execute(country_data)

result = cursor.fetchall()
print(result)</code></pre><pre><code># Country 테이블에 INSERT 
country_insert = (&quot;INSERT INTO Country (country_code, region_code, country_name)&quot;
        &quot;SELECT DISTINCT country, region, country_txt FROM origin_terror_data;&quot;)

cursor.execute(country_insert)
conn.commit()</code></pre><blockquote>
<p>문제 3.  origin_terror_data 에서 attack type, target type, weapon type 관련 데이터는 code 와 txt 속성으로 정의되어 있습니다. 
문제 3-1. AttackType, TargetType, WeaponType 테이블을 그림과 같은 구조로 생성하세요. </p>
</blockquote>
<ul>
<li>origin_terror_data 를 분석하여 각 테이블의 데이터 타입을 정의하세요. </li>
<li>문자열 데이터의 사이즈는 origin_terror_data 테이블에서 해당 데이터의 max length 를 쿼리로 체크하여 정의하세요. 
참고&gt;</li>
<li>AttackType.attacktype_code    = origin_terror_data.attacktype1</li>
<li>AttackType.attacktype_desc    = origin_terror_data.attacktype1_txt </li>
<li>TargetType.targtype_code      = origin_terror_data.targtype1</li>
<li>TargetType.targtype_desc      = origin_terror_data.targtype1_txt</li>
<li>WeaponType.weaptype_code      = origin_terror_data.weaptype1</li>
<li>WeaponType.weaptype_desc      = origin_terror_data.weaptype1_txt </li>
</ul>
<pre><code># attacktype1 의 max length 체크 

attacktype1 = &quot;SELECT max(char_length(attacktype1_txt)) FROM origin_terror_data&quot;

cursor.execute(attacktype1)
result = cursor.fetchall()
result</code></pre><pre><code># AttackType 테이블 만들기 
sql = (&quot;CREATE table AttackType (&quot;
        &quot;attacktype_code int not null auto_increment primary key, &quot;
        &quot;attacktype_desc varchar(35) &quot;
        &quot;)&quot;
        )

cursor.execute(sql)
conn.commit()</code></pre><pre><code># targettype1 의 max length 체크 
targettype1_l = &#39;SELECT max(char_length(targtype1_txt)) FROM origin_terror_data;&#39;

cursor.execute(targettype1_l)
result = cursor.fetchall()
result</code></pre><pre><code># TargetType 테이블 만들기 
TargetType = (&quot;CREATE table TargetType (&quot;
        &quot;targtype_code int not null auto_increment primary key, &quot;
        &quot;targtype_desc varchar(32) &quot;
        &quot;)&quot;
        )

cursor.execute(TargetType)
conn.commit()</code></pre><pre><code># weaptype1 의 max length 체크 
weaptype1_l = &#39;SELECT max(char_length(weaptype1_txt)) FROM origin_terror_data&#39;

cursor.execute(weaptype1_l)
result = cursor.fetchall()
result</code></pre><pre><code># WeaponType 테이블 만들기 
WeaponType = (&quot;create table WeaponType (&quot;
        &quot;weaptype_code int not null auto_increment primary key, &quot;
        &quot;weaptype_desc varchar(80) &quot;
        &quot;)&quot;
        )

cursor.execute(WeaponType)
conn.commit()</code></pre><blockquote>
<p>문제 3-2. origin_terror_data 테이블에서 Attack Type, Target Type, Weapon Type 데이터를 추출하여 문제 3-1.에서 생성한 테이블에 입력하고 확인하세요.   </p>
</blockquote>
<ul>
<li>중복을 제거한 Unique Data 를 추출하세요. </li>
<li>데이터를 INSERT 할때 순서를 고민하세요.</li>
</ul>
<pre><code># attacktype1 데이터 추출하기 
attacktype1_data = (&#39;SELECT DISTINCT attacktype1, attacktype1_txt FROM origin_terror_data ORDER BY attacktype1 ASC&#39;)
cursor.execute(attacktype1_data)

result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># AttackType 테이블에 INSERT 
AttackType_data = (&quot;INSERT INTO AttackType (attacktype_code, attacktype_desc)&quot;
        &quot;SELECT DISTINCT attacktype1, attacktype1_txt FROM origin_terror_data&quot;)

cursor.execute(AttackType_data)
conn.commit()</code></pre><pre><code># desc AttackType
sql = (&#39;desc AttackType&#39;)
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># targtype1 데이터 추출하기 
targtype1_data = (&#39;SELECT DISTINCT targtype1, targtype1_txt FROM origin_terror_data ORDER BY targtype1 ASC &#39;)
cursor.execute(targtype1_data)

result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># TargetType 테이블에 INSERT 

# ALTER TABLE tablename
# CHANGE COLUMN old_columnname new_columnname new_datatype;

targetType_insert = (&#39;INSERT INTO TargetType (targtype_code, targtype_desc) SELECT DISTINCT targtype1, targtype1_txt FROM origin_terror_data&#39;)

cursor.execute(targetType_insert)
conn.commit()</code></pre><pre><code># desc TargetType
sql = (&#39;desc TargetType&#39;)
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># weaptype1 데이터 추출하기 
weaptype1_data = (&#39;SELECT DISTINCT weaptype1, weaptype1_txt FROM origin_terror_data ORDER BY weaptype1 ASC&#39;)
cursor.execute(weaptype1_data)

result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code># WeaponType 테이블에 INSERT 
sql = (&#39;INSERT INTO WeaponType (weaptype_code, weaptype_desc) SELECT DISTINCT weaptype1, weaptype1_txt FROM origin_terror_data&#39;)

cursor.execute(sql)
conn.commit()</code></pre><pre><code># desc WeaponType
sql = (&#39;desc WeaponType&#39;)
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><blockquote>
<p>문제 4. TerrorData 테이블을 만들고 앞서 만들어둔 테이블과 관계를 설정하도록 하겠습니다. 
문제 4-1. TerrorData 테이블을 앞의 그림과 같이 생성하세요. </p>
</blockquote>
<ul>
<li>origin_terror_data 를 분석하여 데이터 타입을 정의하세요. </li>
<li>문자열 데이터의 사이즈는 origin_terror_data 테이블에서 해당 데이터의 max length 를 쿼리로 체크하여 정의하세요. 
참고&gt; </li>
<li>TerrorData.terror id : Auto Increment </li>
<li>TerrorData.city_name    = origin_terror_data.city</li>
<li>TerrorData.target       = origin_terror_data.target1</li>
<li>TerrorData.group_name   = origin_terror_data.gname</li>
<li>TerrorData.kill_count   = origin_terror_data.nkill</li>
<li>TerrorData.wound_count  = origin_terror_data.nwound</li>
<li>TerrorData.motive       = origin_terror_data.motive</li>
<li>TerrorData.summary      = origin_terror_data.summary</li>
<li>TerrorData.latitude     = origin_terror_data.latitude</li>
<li>TerrorData.longitude    = origin_terror_data.longitude</li>
<li>TerrorData.terror_date  = origin_terror_data.iyear + origin_terror_data.imonth + origin_terror_data.iday (Date Type)</li>
</ul>
<pre><code># max length 체크 

# city 의 max length 체크 
city_ML = &#39;SELECT max(char_length(city)) FROM origin_terror_data&#39;
cursor.execute(city_ML)
city = cursor.fetchone()

# target1 의 max length 체크 
target1_ML = &#39;SELECT max(char_length(target1)) FROM origin_terror_data&#39;
cursor.execute(target1_ML)
target = cursor.fetchone()

# gname 의 max length 체크 
gname_ML = &#39;SELECT max(char_length(gname)) FROM origin_terror_data&#39;
cursor.execute(gname_ML)
gname = cursor.fetchone()

# summary 의 max length 체크 
summary_ML = &#39;SELECT max(char_length(summary)) FROM origin_terror_data&#39;
cursor.execute(summary_ML)
summary = cursor.fetchone()

# motive 의 max length 체크 
motive_ML = &#39;SELECT max(char_length(motive)) FROM origin_terror_data&#39;
cursor.execute(motive_ML)
motive = cursor.fetchone()

print(f&#39;city: {city}&#39;)
print(f&#39;target1: {target}&#39;)
print(f&#39;gname: {gname}&#39;)
print(f&#39;summary: {summary}&#39;)
print(f&#39;motive: {motive}&#39;)</code></pre><pre><code># TerrorData 테이블 만들기 
terrorData_table = (&quot;create table TerrorData (&quot;
        &quot;terror_id int not null auto_increment primary key, &quot;
        &quot;terror_date date, &quot;
        &quot;region_code int, &quot;
        &quot;country_code int, &quot;
        &quot;city varchar(65), &quot;
        &quot;target varchar(350), &quot;
        &quot;group_name varchar(120), &quot;
        &quot;targtype_code int, &quot;
        &quot;attacktype_code int, &quot;  
        &quot;weaptype_code int, &quot;
        &quot;kill_count int, &quot;
        &quot;wound_count int, &quot;     
        &quot;motive varchar(900), &quot;
        &quot;summary varchar(2450), &quot;
        &quot;latitude decimal(16, 14), &quot;
        &quot;longitude decimal(17, 14), &quot;  
        &quot;foreign key (region_code) references Region(region_code), &quot;
        &quot;foreign key (country_code) references Country(country_code), &quot;
        &quot;foreign key (attacktype_code) references AttackType(attacktype_code), &quot;
        &quot;foreign key (targtype_code) references TargetType(targtype_code), &quot;
        &quot;foreign key (weaptype_code) references WeaponType(weaptype_code) &quot;
        &quot;);&quot;
        )

cursor.execute(terrorData_table)
conn.commit()</code></pre><blockquote>
<p>문제 4-2. origin_terror_data 테이블에서 Terror Data를 추출하여 문제 4-1.에서 생성한 테이블에 입력하고 확인하세요.</p>
</blockquote>
<ul>
<li>앞서 생성한 Region, Country, AttackType, TargetType, WeaponType 데이터와의 관계에 주의하세요.</li>
<li>Count 값을 가지는 칼럼의 값이 null 인 경우, 0으로 예외처리 해주세요.</li>
<li>위도 경도 데이터 중 범위를 넘어서는 데이터가 존재합니다. 이 경우, null 값으로 예외처리 해주세요. (위도 경도 범위 : 구글링해보세요.)</li>
<li>terror_date 칼럼의 경우, origin_terror_data 의 연, 월, 일 정보를 조합하여 date type 으로 정의해주세요. (Format : ‘YYYY-mm-dd’)</li>
<li>origin_terror_data 의 월, 일 정보중 값이 0 인 경우 date type 으로 변환되지 않습니다. 이 경우, 1 로 예외처리 해주세요.</li>
<li>데이터 입력까지 완료한 이후, origin_terror_data 테이블을 삭제하고 확인하세요.</li>
</ul>
<pre><code># data 전처리 
# date type : year + month + day (month = 0 인경우 1, day = 0 인경우 1)
# nkill, nwound : null 인 경우 0
# longitude range : 180 ~ -180
# check : select longitude from origin_terror_data where longitude &lt; -180 or longitude &gt; 180; &#39;-86185896&#39;

sql = (&quot;INSERT INTO TerrorData (region_code, country_code, attacktype_code, targtype_code, weaptype_code, terror_date, city, target, group_name, kill_count, wound_count, motive, summary, latitude, longitude) &quot;
        &quot;SELECT region, country, attacktype1, targtype1, weaptype1, city, target1, gname, &quot;
        &quot;IF(nkill IS NULL, 0, nkill), &quot;   
        &quot;IF(nwound IS NULL, 0, nwound), &quot;
        &quot;motive, summary, &quot;
        &quot;CASE WHEN latitude BETWEEN -90 and 90 THEN NULL ELSE latitude END, &quot;
        &quot;CASE WHEN longitude BETWEEN -180 and 180 THEN NULL ELSE longitude END, &quot;
        &quot;STR_TO_DATE(CONCAT(IF(iyear = 0, 1, iyear)&#39;-&#39;IF(imonth = 0, 1, imonth)&#39;-&#39;IF(iday = 0, 1, iday)), &#39;%Y-%m-%d&#39;) FROM origin_terror_data&quot;)

cursor.execute(sql)
conn.commit()</code></pre><pre><code># terror 데이터 추출하기 
# desc TerrorData
sql = (&#39;desc TerrorData&#39;)
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code>sql = &#39;SELECT COUNT(*) FROM TerrorData&#39;
cursor.execute(sql)
result = cursor.fetchall()
print(result)</code></pre><blockquote>
<p>문제 5. TerrorData 의 전체 기간에서 테러의 숫자를 연도별로 집계하여 연도별 테러 발생 건수를 조회하세요. </p>
<blockquote>
<p>답 틀림...🙄</p>
</blockquote>
</blockquote>
<pre><code>sql = &quot;SELECT DATE_FORMAT(terror_date, &#39;%Y&#39;) as Y, count(*) as C FROM TerrorData GROUP BY Y&quot;
cursor.execute(sql)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><blockquote>
<p>문제 6. TerrorData 에서 테러가 가장 많이 일어난 순서로 국가를 정렬하여 상위 10위 국가를 조회하세요. </p>
</blockquote>
<pre><code># 테러가 많이 일어난 상위 10위 Region
terrorRegion_10 = (&quot;SELECT R.region_name, COUNT(*) FROM TerrorData as T &quot;
         &quot;JOIN Region as R ON T.region_code = R.region_code &quot;
         &quot;GROUP BY T.region_code ORDER BY COUNT(*) DESC limit 10&quot;
         )

cursor.execute(terrorRegion_10)
result = cursor.fetchall()
for i in result:
    print(i)

# 테러가 많이 일어난 상위 10위 Country 
terrorCountry_10 = (&quot;SELECT C.country_name, COUNT(*) FROM TerrorData as T &quot;
         &quot;JOIN Country as C ON T.country_code = C.country_code &quot;
         &quot;GROUP BY T.country_code ORDER BY COUNT(*) DESC limit 10&quot;
         )

cursor.execute(terrorCountry_10)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><blockquote>
<p>문제 7. TerrorData 에서 테러가 가장 많이 일어난 상위 10위 국가에 대해 국가별로 사망자수와 부상자수, 사상자수(사망자수 + 부상자수)를 조회하세요. </p>
</blockquote>
<pre><code>country_10 = (&quot;SELECT C.country_name, COUNT(*), &quot; # 모든행의 갯수
        &quot;sum(T.kill_count), &quot;
        &quot;sum(T.wound_count), &quot;
        &quot;sum(T.kill_count + T.wound_count) &quot;
        &quot;FROM TerrorData as T &quot;
        &quot;JOIN Country as C ON T.country_code=C.country_code &quot;
        &quot;GROUP BY T.country_code, C.country_code &quot;
        &quot;ORDER BY COUNT(*)&quot; #  COUNT(*) 값을 기준으로 descending
        &quot;DESC limit 10&quot;)

cursor.execute(country_10)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><blockquote>
<p>문제 8. 지역별 테러 공격 형태에 따른 사망자수, 부상자수, 사상자수를 조회하세요.</p>
<blockquote>
<p>답 틀림..🙄</p>
</blockquote>
</blockquote>
<pre><code># 지역별 테러 공격 형태에 따른 사망자와 사상자 수 

region_c = (&quot;SELECT R.region_name, AttackType.attacktype_desc, &quot;
            &quot;sum(T.kill_count), sum(T.wound_count),&quot;
            &quot;sum(T.kill_count + T.wound_count) &quot;
            &quot;FROM TerrorData as T &quot;
            &quot;JOIN Region as R &quot;
            &quot;ON T.region_code = R.region_code &quot;
            &quot;JOIN AttackType ON T.attacktype_code = AttackType.attacktype_code &quot;
            &quot;GROUP BY R.region_name, AttackType.attacktype_desc&quot;)

cursor.execute(region_c)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><p>제로베이스 데이터 스쿨</p>
]]></description>
        </item>
        <item>
            <title><![CDATA[SQL - mini test _ 유가 분석]]></title>
            <link>https://velog.io/@jaam_mini/SQL-mini-test-%EC%9C%A0%EA%B0%80-%EB%B6%84%EC%84%9D</link>
            <guid>https://velog.io/@jaam_mini/SQL-mini-test-%EC%9C%A0%EA%B0%80-%EB%B6%84%EC%84%9D</guid>
            <pubDate>Sun, 28 Jan 2024 04:31:51 GMT</pubDate>
            <description><![CDATA[<p>하루 온종일 풀어본 SQL 유가 분석</p>
<p>ctrl+shift+i</p>
<pre><code>import mysql.connector

conn = mysql.connector.connect(
    host = &quot;&quot;,
    port = 3306,
    user = &quot;oneday&quot;,
    password = &quot;1234&quot;,
    database = &quot;oneday&quot;
)

cursor = conn.cursor(buffered=True)</code></pre><p>1.</p>
<pre><code># gas_brand
sql_b = &quot;CREATE TABLE GAS_BRAND(&quot; + \
            &quot;id int not null auto_increment primary key, &quot; + \
            &quot;name varchar(16) not null)&quot;

cursor.execute(sql_b)
</code></pre><pre><code>
# gas_station
sql_s = &quot;CREATE TABLE GAS_STATION(&quot; + \
            &quot;id int auto_increment primary key, &quot; +\
            &quot;brand int not null, &quot; +\
            &quot;name varchar(64) not null, &quot; +\
            &quot;city char(2) not null, &quot; +\
            &quot;gu varchar(10) not null, &quot; +\
            &quot;address varchar(128) not null, &quot; +\
            &quot;gasoline int not null, &quot; +\
            &quot;diesel int not null, &quot; +\
            &quot;self boolean not null, &quot; +\
            &quot;car_wash boolean not null, &quot; +\
            &quot;charging_station boolean not null, &quot; +\
            &quot;car_maintenance boolean not null, &quot; +\
            &quot;convenience_store boolean not null, &quot; +\
            &quot;24_hours boolean not null, &quot; +\
            &quot;lat decimal(16,14) not null, &quot; +\
            &quot;lng decimal(17,14) not null, &quot; +\
            &quot;foreign key (brand) references GAS_BRAND(id));&quot;

cursor.execute(sql_s)</code></pre><p>2.</p>
<pre><code>queries1 = [
    (1, &#39;SK에너지&#39;),
    (2, &#39;HD현대오일뱅크&#39;),
    (3, &#39;GS칼텍스&#39;),
    (4, &#39;S-OIL&#39;),
    (5, &#39;알뜰주유소&#39;),
    (6, &#39;자가상표&#39;)
]

query = &quot;INSERT INTO GAS_BRAND VALUES (%s, %s)&quot;
cursor.executemany(query, queries1)

conn.commit()</code></pre><pre><code>sql_result = &quot;DESC GAS_STATION&quot;
cursor.execute(sql_result)

result = cursor.fetchall()
for i in result:
    print(i)</code></pre><pre><code>sql_result = &quot;SELECT * FROM GAS_BRAND&quot;
cursor.execute(sql_result)

result = cursor.fetchall()
for i in result:
    print(i)</code></pre><p>3.</p>
<pre><code># 화폐단위 문자형 &gt;&gt;  숫자형 
def stringToInt(s):
    if s != &#39;&#39;:
        s = s.replace(&#39;,&#39;, &#39;&#39;)
        return int(s)
    else: 
        return None

stringToInt(&#39;1,000&#39;)</code></pre><pre><code># 주유소 브랜드를 입력하면 GAS_BRAND 데이터를 참고하여 ID 반환
def getID(brand):
    sql_result = &quot;SELECT * FROM GAS_BRAND&quot;
    cursor.execute(sql_result)
    result = cursor.fetchall()
    for i in result:
        if i[1] == brand:
            return i[0]
        # 브랜드명이 &#39;알뜰(ex)&#39;인 경우 있음
        elif brand == &#39;알뜰(ex)&#39;:
            return 5

getID(&#39;SK에너지&#39;)</code></pre><pre><code># 주소를 입력받아 구 이름 반환
def getGu(add):
    addList = add.split()
    return addList[1]

getGu(&#39;서울시 강남구 헌릉로 730&#39;)</code></pre><pre><code>import googlemaps
gmaps_key = &#39;AIzaSyALyv5xMRzF_RJUIeJ84qh25GgNWoIJ8LM&#39;
gmaps = googlemaps.Client(key = gmaps_key)

# 주소를 입력받아 위도, 경도 반환
def getLL(add):
    tmp = gmaps.geocode(add, language=&#39;ko&#39;)
    lat = tmp[0].get(&quot;geometry&quot;)[&quot;location&quot;][&quot;lat&quot;]
    lng = tmp[0].get(&quot;geometry&quot;)[&quot;location&quot;][&quot;lng&quot;]

    return lat, lng

getLL(&#39;서울시 강남구 헌릉로 730&#39;)</code></pre><p>4.</p>
<pre><code>import time 
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

# 오피넷 -&gt; 구 정보 가져오기
url = &#39;https://www.opinet.co.kr/searRgSelect.do&#39;
driver = webdriver.Chrome(executable_path=&#39;../driver/chromedriver.exe&#39;)
driver.get(url)
</code></pre><pre><code># 시/도
sido_list_raw = driver.find_element(By.ID, &quot;SIDO_NM0&quot;)
sido_list = sido_list_raw.find_elements(By.TAG_NAME, &quot;option&quot;)</code></pre><pre><code># 서울 선택
seoul_select = sido_list[1].get_attribute(&quot;value&quot;)
sido_list_raw.send_keys(seoul_select)</code></pre><pre><code># 구 리스트 만들기
gu_list_raw = driver.find_element(By.ID, &quot;SIGUNGU_NM0&quot;)
gu_list = gu_list_raw.find_elements(By.TAG_NAME, &quot;option&quot;)

gu_names = [option.get_attribute(&quot;value&quot;) for option in gu_list]
gu_names = gu_names[1:]</code></pre><pre><code>sql = &quot;INSERT INTO GAS_STATION (brand, name, city, gu, address, gasoline, diesel, self, &quot; +\
        &quot;car_wash, charging_station, car_maintenance, convenience_store, 24_hours, lat, lng) &quot; +\
        &quot;VALUES (%s, %s, &#39;서울&#39;, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)&quot;</code></pre><pre><code>def check(data, tag):
    return &#39;off&#39; not in data.select_one(tag)[&#39;src&#39;]</code></pre><pre><code>sqltmp = &quot;ALTER TABLE GAS_STATION MODIFY diesel int NULL;&quot;
cursor.execute(sqltmp)
conn.commit()</code></pre><pre><code>for gu in tqdm_notebook(gu_names):
    element = driver.find_element(By.ID, &#39;SIGUNGU_NM0&#39;)
    element.send_keys(gu)
    time.sleep(0.5)

    html = driver.page_source
    soup = BeautifulSoup(html, &#39;html.parser&#39;)

    # 검색할 주유소 개수
    cnt = int(driver.find_element(By.ID, &#39;totCnt&#39;).text)

    for i in range(1, cnt+1):

        # 각 주유소 클릭
        station = driver.find_element(By.CSS_SELECTOR, f&#39;#body1 &gt; tr:nth-child({i}) &gt; td.rlist &gt; a&#39;)
        station.click()

        html = driver.page_source
        soup = BeautifulSoup(html, &#39;html.parser&#39;)

        data = soup.select(&#39;#os_dtail_info&#39;)[0]

        # brand
        brand = getID(data.select_one(&#39;#poll_div_nm&#39;).text)

        # name
        name = data.select_one(&#39;.header&#39;).text.strip()

        # address
        address = data.select_one(&#39;#rd_addr&#39;).text

        # gasoline
        gasoline = stringToInt(data.select_one(&#39;#b027_p&#39;).text)

        # diesel
        diesel = stringToInt(data.select_one(&#39;#d047_p&#39;).text)

        # self 
        slf = data.select_one(&#39;#SPAN_SELF_VLT_YN_ID&#39;)
        if type(slf.find(&#39;img&#39;)) == type(None):
            is_self = False
        else:
            is_self = True

        # car_wash
        car_wash = check(data, &#39;#cwsh_yn&#39;)

        # charging_station
        charging_station = check(data, &#39;#lpg_yn&#39;)

        # car_maintenance
        car_maintenance = check(data, &#39;#maint_yn&#39;)

        # convenience_store
        convenience_store = check(data, &#39;#cvs_yn&#39;)

        # 24_hours
        sel24 = check(data, &#39;#sel24_yn&#39;)

        tmp = gmaps.geocode(address, language=&#39;ko&#39;)
        # lat
        lat = tmp[0].get(&#39;geometry&#39;)[&#39;location&#39;][&#39;lat&#39;]

        # lng
        lng = tmp[0].get(&#39;geometry&#39;)[&#39;location&#39;][&#39;lng&#39;]

        cursor.execute(sql, (brand, name, gu, address, gasoline, diesel, 
                            is_self, car_wash, charging_station, car_maintenance, convenience_store, sel24, lat, lng))

        conn.commit()</code></pre><pre><code># 데이터 개수 확인
cursor.execute(&quot;select count(*) from GAS_STATION&quot;)
result = cursor.fetchall()
print(result[0])</code></pre><pre><code># 데이터 상위 10개 출력
cursor.execute(&quot;select * from GAS_STATION limit 10&quot;)
result = cursor.fetchall()
for i in result:
    print(i)</code></pre><p>5.</p>
<pre><code>import pandas as pd

sql = &quot;select s.id, b.name &#39;brand&#39;, s.name, s.city, s.gu, s.address, s.gasoline, s.diesel, s.self, &quot; +\
        &quot;s.car_wash, s.charging_station, s.car_maintenance, s.convenience_store, s.24_hours, &quot; +\
        &quot;s.lat, s.lng &quot; +\
        &quot;from GAS_BRAND b, GAS_STATION s &quot; +\
        &quot;where b.id = s.brand ORDER BY s.id&quot;

cursor.execute(sql)
result = cursor.fetchall()

columns = [i[0] for i in cursor.description]</code></pre><pre><code>df = pd.DataFrame(data=result, columns=columns)
df.to_csv(&quot;[DS]sql2_oilstation_ohjaemin.csv&quot;, index=False, encoding=&#39;euc-kr&#39;)</code></pre><pre><code>df = pd.read_csv(&quot;[DS]sql2_chasuhui.csv&quot;,  index_col=0, thousands=&#39;,&#39;, encoding=&#39;euc-kr&#39;)
df.head(10)</code></pre><p>6.</p>
<pre><code># 미왕빌딩 주소: 서울 강남구 강남대로 364
lat, lng = getLL(&#39;서울 강남구 강남대로 364&#39;)
lat, lng</code></pre><pre><code># POINT(경도, 위도)
# SET @location = POINT(경도, 위도) : 기준이 되는 위치 설정
# ST_DISTANCE_SPHERE(POINT, POINT) : 두 좌표 간 거리(단위: m)

cursor.execute(&quot;SET @location = POINT(127.029340, 37.495599)&quot;)

cursor.execute(&quot;SELECT * FROM (SELECT s.id id, b.name brand, s.name name, address, \
    ST_DISTANCE_SPHERE(@location, POINT(lng, lat))/1000 distance \
    FROM GAS_BRAND b, GAS_STATION s WHERE b.id = s.brand) t \
    WHERE distance*1000 &lt;= 1000&quot;)

result = cursor.fetchall()
for row in result:
    print(row)</code></pre><p>7.</p>
<pre><code>cursor.execute(&quot;SELECT * FROM (SELECT s.id id, b.name brand, s.name name, address, \
    gasoline, self, 24_hours, convenience_store, \
    ST_DISTANCE_SPHERE(@location, POINT(lng, lat))/1000 distance \
    FROM GAS_BRAND b, GAS_STATION s \
    WHERE b.id = s.brand and self = 1 and 24_hours = 1 and convenience_store = 1 \
    ORDER BY distance LIMIT 10) t \
    ORDER BY gasoline&quot;)

result = cursor.fetchall()
for row in result:
    print(row)</code></pre><p>8.</p>
<pre><code>cursor.execute(&quot;SELECT gu, b.name brand, avg(gasoline) avg_price \
    FROM GAS_BRAND b, GAS_STATION s \
    WHERE b.id = s.brand GROUP BY gu, brand ORDER BY avg_price&quot;)

result = cursor.fetchall()
for row in result:
    print(row)</code></pre><pre><code>conn.close()
driver.quit()</code></pre>]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 10. 앙상블 기법 - HAR, Human Activity Recognition - 센서를 활용한 행동인식 실험]]></title>
            <link>https://velog.io/@jaam_mini/ML-10.-%EC%95%99%EC%83%81%EB%B8%94-%EA%B8%B0%EB%B2%95-HAR-Human-Activity-Recognition-%EC%84%BC%EC%84%9C%EB%A5%BC-%ED%99%9C%EC%9A%A9%ED%95%9C-%ED%96%89%EB%8F%99%EC%9D%B8%EC%8B%9D-%EC%8B%A4%ED%97%98</link>
            <guid>https://velog.io/@jaam_mini/ML-10.-%EC%95%99%EC%83%81%EB%B8%94-%EA%B8%B0%EB%B2%95-HAR-Human-Activity-Recognition-%EC%84%BC%EC%84%9C%EB%A5%BC-%ED%99%9C%EC%9A%A9%ED%95%9C-%ED%96%89%EB%8F%99%EC%9D%B8%EC%8B%9D-%EC%8B%A4%ED%97%98</guid>
            <pubDate>Tue, 23 Jan 2024 16:05:50 GMT</pubDate>
            <description><![CDATA[<p>앙상블 ?
<img src="https://velog.velcdn.com/images/jaam_mini/post/c1dac867-ca28-45e1-a862-3f269907bd3d/image.png" alt=""></p>
<h3 id="📌-앙상블-기법의-voting">📌 앙상블 기법의 voting</h3>
<ul>
<li>전체 데이터 셋에서 각기 다른 알고리즘을 돌리는 것</li>
<li>아는 것 다 돌려보고 다수결에 의해서 최종결정 하겠다</li>
</ul>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/c5339e99-8cbf-4fe1-a464-23fe8da31fbf/image.png" alt=""></p>
<h3 id="📌-bagging-기법">📌 bagging 기법</h3>
<ul>
<li>bootstrapping : 중복을 허용해 샘플링 함</li>
<li>랜덤하게 샘플링된 데이터에 각각의 알고리즘을 붙여서 결과를 받아들임</li>
</ul>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/96513506-c5d4-4f3a-b3c1-d20f0e1945ef/image.png" alt=""></p>
<h3 id="📌-결정-방법에서의-하드보팅">📌 결정 방법에서의 하드보팅</h3>
<ul>
<li>우리가 아는 다수결</li>
<li>모두가 1인데 하나가 2면, 2는 제거하고 1을 선택</li>
</ul>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/d3c71e2a-4b94-449d-82b4-346c4f2ef2ff/image.png" alt=""></p>
<h3 id="📌-소프트보팅">📌 소프트보팅</h3>
<ul>
<li>동일한 값의 확률 평균을 구해서, 다른 값과 비교</li>
<li>동점일 시 [다수결]을 따르고</li>
<li>더 높은 값이 있다면, 높은 점수를 선택</li>
</ul>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/8343946b-8540-40d0-8577-a85586c0df0a/image.png" alt=""></p>
<h3 id="📌-랜덤포레스트">📌 랜덤포레스트</h3>
<ul>
<li>DecisionTree(결정나무) 여러개를 사용해서 투표하는 방식</li>
<li>bagging 기법의 대표적인 방법</li>
</ul>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/84e89490-6461-49fb-af42-2e922f706c34/image.png" alt=""></p>
<p></br></br></br></br></p>
<h1 id="har-human-activity-recognition">HAR, Human Activity Recognition</h1>
<hr>
<ul>
<li><p>IMU 센서를 활용해서 사람의 행동을 인식하는 실험</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/1581bbe2-98bc-4c12-821a-f406e64b37a2/image.png" alt=""></li>
</ul>
</li>
<li><p>폰에 있는 가속도/자이로 센서 사용</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/690cf0a2-b60b-4b1e-ba47-dcbb7fe05886/image.png" alt=""></li>
</ul>
</li>
<li><p>데이터 소개</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/ad21f8c8-4fee-4691-85a6-a4f8311343e5/image.png" alt=""></li>
</ul>
</li>
</ul>
<ul>
<li>데이터 특성<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/db6adeba-003d-4151-8c9f-7f0ebd2d002a/image.png" alt=""></li>
</ul>
</li>
</ul>
<ul>
<li>데이터 클래스<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b703be24-3111-42f7-baf6-6612c26927f7/image.png" alt=""></li>
</ul>
</li>
</ul>
<p>1) 데이터 읽기</p>
<pre><code>import pandas as pd
import matplotlib.pyplot as plt

# txt 파일
url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt&#39;

# &#39;\s+&#39; 공백, header 그대로, 컬럼 이름 names
feature_name_df = pd.read_csv(url, sep=&#39;\s+&#39;, header=None, names=[&#39;columns_index&#39;,&#39;columns_name&#39;])
feature_name_df.head()</code></pre><p>2) 특성(feature) 갯수 확인</p>
<pre><code>len(feature_name_df)</code></pre><p>3) 데이터 확인</p>
<pre><code># 밸류만 가지고 feature_name 추출 -&gt; 즉, 앞으로 561개의 이름만 저장하게 됨
feature_name = feature_name_df.iloc[:, 1].values.tolist()
feature_name[:10]</code></pre><p>4) 일단 X데이터만</p>
<pre><code>X_train_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/X_train.txt&#39;
X_test_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/X_test.txt&#39;

X_train = pd.read_csv(X_train_url, sep=&#39;\s+&#39;, header=None)
X_test = pd.read_csv(X_test_url, sep=&#39;\s+&#39;, header=None)</code></pre><p>5) 대용량 데이터 컬럼 확인</p>
<pre><code>X_train.columns = feature_name
X_test.columns = feature_name
X_train.head()</code></pre><p>6) y 데이터 읽어오기</p>
<pre><code>y_train_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/y_train.txt&#39;
y_test_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/y_test.txt&#39;

y_train = pd.read_csv(y_train_url, sep=&#39;\s+&#39;, header=None, names=[&#39;action&#39;])
y_test = pd.read_csv(y_test_url, sep=&#39;\s+&#39;, header=None, names=[&#39;action&#39;])</code></pre><p>7) shape - 개수 확인</p>
<pre><code>X_train.shape, X_test.shape, y_train.shape, y_test.shape</code></pre><p>8) 각 action 별 데이터 수</p>
<pre><code>y_train[&#39;action&#39;].value_counts()</code></pre><ul>
<li>action
6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: count, dtype: int64</li>
</ul>
<p>9) DecisionTree 결정나무</p>
<pre><code>from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
dt_clf.fit(X_train, y_train)

pred = dt_clf.predict(X_test)
accuracy_score(y_test, pred)</code></pre><ul>
<li>0.8096369189005769</li>
</ul>
<p>10) Train - GridSearchCV (max_depth를 다양하게 하기 위해)</p>
<pre><code>from sklearn.model_selection import GridSearchCV

params = {
    &#39;max_depth&#39; : [6,8,10,12,16,20,24]
}

# scoring=&#39;accuracy&#39;: accuracy 계열은 기록해주세요
# cv=5  :KFold는 5개
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring=&#39;accuracy&#39;, cv=5, return_train_score=True)
grid_cv.fit(X_train, y_train)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/da6e6514-c38d-4a90-8d55-52f6903d788e/image.png" alt=""></li>
</ul>
<p>11) Train 밸리데이션한 값의 best score &amp; params 확인</p>
<pre><code>grid_cv.best_score_, grid_cv.best_params_</code></pre><p>(0.8543335321892183, {&#39;max_depth&#39;: 8})</p>
<p>12) Train max_depth별로 표로 성능을 정리</p>
<pre><code>cv_result_df = pd.DataFrame(grid_cv.cv_results_)
cv_result_df[[&#39;param_max_depth&#39;, &#39;mean_test_score&#39;, &#39;mean_train_score&#39;]]</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/ada15eae-2493-43d2-967a-ee32d99c3c9b/image.png" alt=""></li>
</ul>
<p>13) Test 데이터에서의 결과</p>
<pre><code>max_depth = [6,8,10,12,16,20,24]

for depth in max_depth:
    dt_clf = DecisionTreeClassifier(max_depth=depth, random_state=156)
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print(&#39;Max_Depth =&#39;, depth, &#39;, Accuracy =&#39;, accuracy)</code></pre><ul>
<li>Max_Depth = 6 , Accuracy = 0.8557855446216491
Max_Depth = 8 , Accuracy = 0.8707159823549372
Max_Depth = 10 , Accuracy = 0.8673227010519172
Max_Depth = 12 , Accuracy = 0.8646080760095012
Max_Depth = 16 , Accuracy = 0.8574821852731591
Max_Depth = 20 , Accuracy = 0.8547675602307431
Max_Depth = 24 , Accuracy = 0.8547675602307431</li>
</ul>
<p>14) Test - 베스트 모델의 결과는</p>
<pre><code>best_dt_clf = grid_cv.best_estimator_
pred1 = best_dt_clf.predict(X_test)
accuracy_score(y_test, pred1)</code></pre><p>0.8734306073973532</p>
<p>15) 랜덤포레스트 적용</p>
<pre><code>from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {
    &#39;max_depth&#39; : [6,8,10], # DecisionTree에 적용할 파라미터
    &#39;n_estimators&#39; : [50,100,200], # DecisionTree에 tree 몇개
    &#39;min_samples_leaf&#39; : [8,12], # DecisionTree에 맨끝 데이터(leaf) 최소 몇개
    &#39;min_samples_split&#39; : [8,12] # 분할 기준에서 최소한으로 남는 데이터 수 (큰영향 X)
}

rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1) # n_jobs=-1 : cpu core 다 써서
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/5b6e21d7-a8eb-4748-957d-6846451c590f/image.png" alt=""></li>
</ul>
<p>16) 결과 정리</p>
<pre><code>cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_result_df.columns</code></pre><ul>
<li>Index([&#39;mean_fit_time&#39;, &#39;std_fit_time&#39;, &#39;mean_score_time&#39;, &#39;std_score_time&#39;,<pre><code> &#39;param_max_depth&#39;, &#39;params&#39;, &#39;split0_test_score&#39;, &#39;split1_test_score&#39;,
 &#39;split2_test_score&#39;, &#39;split3_test_score&#39;, &#39;split4_test_score&#39;,
 &#39;mean_test_score&#39;, &#39;std_test_score&#39;, &#39;rank_test_score&#39;,
 &#39;split0_train_score&#39;, &#39;split1_train_score&#39;, &#39;split2_train_score&#39;,
 &#39;split3_train_score&#39;, &#39;split4_train_score&#39;, &#39;mean_train_score&#39;,
 &#39;std_train_score&#39;],
dtype=&#39;object&#39;)</code></pre></li>
</ul>
<p>17) target_cols 지정 &amp; 순위 매기기</p>
<pre><code># mean_test_score : train 데이터의 validation score (아까는 85% 였는데, randomforest하니까 90% 이상)
# param_n_estimators : 몇개의 나무
target_cols = [&#39;rank_test_score&#39;, &#39;mean_test_score&#39;, &#39;param_n_estimators&#39;, &#39;param_max_depth&#39;]

cv_results_df[target_cols].sort_values(&#39;rank_test_score&#39;).head()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/277927ef-4661-4e73-a9a8-712b20bd378c/image.png" alt=""></li>
</ul>
<p>18) best 찾기</p>
<pre><code>grid_cv.best_params_, grid_cv.best_score_</code></pre><ul>
<li>({&#39;max_depth&#39;: 10,
&#39;min_samples_leaf&#39;: 8,
&#39;min_samples_split&#39;: 8,
&#39;n_estimators&#39;: 100},
0.9151251360174102)</li>
</ul>
<p>19) Test 데이터에 적용</p>
<pre><code>rf_clf_best = grid_cv.best_estimator_
rf_clf_best.fit(X_train, y_train)

pred1 = rf_clf_best.predict(X_test)
accuracy_score(y_test, pred1)</code></pre><p>0.9205972175093315</p>
<p>20) 중요특성 확인</p>
<pre><code># 베스트 모델에서 랜덤포레스트를 반환(feature_importances_) 받아서
best_cols_values = rf_clf_best.feature_importances_ 

# 영향력이 높은(best_cols_values) feature만 추려서
best_cols = pd.Series(best_cols_values, index=X_train.columns)

# 정렬(sort_values) 한 다음에 20개만 출력
top20_cols = best_cols.sort_values(ascending=False)[:20]
top20_cols</code></pre><ul>
<li>angle(X,gravityMean)               0.034638
tGravityAcc-max()-Y                0.032518
tGravityAcc-energy()-X             0.031309
tGravityAcc-mean()-X               0.029513
tGravityAcc-min()-X                0.027775
tGravityAcc-max()-X                0.027662
angle(Y,gravityMean)               0.026553
tGravityAcc-mean()-Y               0.026052
tGravityAcc-min()-Y                0.023037
tGravityAcc-energy()-Y             0.018678
tGravityAcc-mean()-Z               0.015688
angle(Z,gravityMean)               0.012837
fBodyAcc-mad()-X                   0.012558
tBodyAcc-max()-X                   0.011970
fBodyAccJerk-bandsEnergy()-1,24    0.011803
tBodyAccJerk-entropy()-X           0.011647
tGravityAccMag-std()               0.011451
tBodyAccJerk-energy()-X            0.011333
tGravityAcc-arCoeff()-Z,1          0.011257
fBodyAccJerk-max()-X               0.011040
dtype: float64</li>
</ul>
<p>21) 주요 특성 확인</p>
<pre><code>import seaborn as sns

plt.figure(figsize=(8, 8))
sns.barplot(x=top20_cols, y=top20_cols.index)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/0ec4eb96-5635-4c65-96b4-6eab46e298d1/image.png" alt=""></li>
</ul>
<p>22) 주요 특성 20 가지</p>
<ul>
<li>561개를 굳이 다써야 하나? 아래 그래프를 보아 하니 주요 특성 몇가지만 가지고 봐도 될 것 같음</li>
<li>따라서 20개로 보고자 함<pre><code>top20_cols.index</code></pre></li>
<li>Index([&#39;angle(X,gravityMean)&#39;, &#39;tGravityAcc-max()-Y&#39;, &#39;tGravityAcc-energy()-X&#39;,<pre><code> &#39;tGravityAcc-mean()-X&#39;, &#39;tGravityAcc-min()-X&#39;, &#39;tGravityAcc-max()-X&#39;,
 &#39;angle(Y,gravityMean)&#39;, &#39;tGravityAcc-mean()-Y&#39;, &#39;tGravityAcc-min()-Y&#39;,
 &#39;tGravityAcc-energy()-Y&#39;, &#39;tGravityAcc-mean()-Z&#39;,
 &#39;angle(Z,gravityMean)&#39;, &#39;fBodyAcc-mad()-X&#39;, &#39;tBodyAcc-max()-X&#39;,
 &#39;fBodyAccJerk-bandsEnergy()-1,24&#39;, &#39;tBodyAccJerk-entropy()-X&#39;,
 &#39;tGravityAccMag-std()&#39;, &#39;tBodyAccJerk-energy()-X&#39;,
 &#39;tGravityAcc-arCoeff()-Z,1&#39;, &#39;fBodyAccJerk-max()-X&#39;],
dtype=&#39;object&#39;)</code></pre></li>
</ul>
<p>23) 20개 특성으로 다시 확인</p>
<pre><code>X_train_re = X_train[top20_cols.index]
X_test_re = X_test[top20_cols.index]

rf_clf_best_re = grid_cv.best_estimator_
rf_clf_best_re.fit(X_train_re, y_train.values.reshape(-1, ))

pred1_re = rf_clf_best_re.predict(X_test_re)

accuracy_score(y_test, pred1_re)</code></pre><p>0.8177807940278249</p>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 9. Precision(정밀도) and Recall(재현율)]]></title>
            <link>https://velog.io/@jaam_mini/ML-9.-Precision%EC%A0%95%EB%B0%80%EB%8F%84-and-Recall%EC%9E%AC%ED%98%84%EC%9C%A8</link>
            <guid>https://velog.io/@jaam_mini/ML-9.-Precision%EC%A0%95%EB%B0%80%EB%8F%84-and-Recall%EC%9E%AC%ED%98%84%EC%9C%A8</guid>
            <pubDate>Tue, 23 Jan 2024 15:39:44 GMT</pubDate>
            <description><![CDATA[<p> Precision(정밀도) and Recall(재현율)</p>
<p> 1) 데이터 가져오기</p>
<pre><code class="language-py">import pandas as pd
wine_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv&#39;
wine = pd.read_csv(wine_url,index_col=0)
wine.head()</code></pre>
</br>

<p>2) 맛 등급 설정</p>
<pre><code class="language-py"># (1) quality 컬럼 이진화
# wine 데이터의 [&#39;taste&#39;] 컬럼 생성
# wine의 quality column울 grade로 잡고, 5등급 보다 크면 1, 그게 아니라면 0으로 잡음
wine[&#39;taste&#39;] = [1. if grade&gt;5 else 0. for grade in wine[&#39;quality&#39;]]
# (2) 모델링
# label인 taste, quality를 drop, 나머지를 X의 특성으로 봄
X = wine.drop([&#39;taste&#39;, &#39;quality&#39;], axis=1)
# 새로만들 y데이터
y = wine[&#39;taste&#39;]</code></pre>
</br>

<p>3) 데이터 분리</p>
<pre><code class="language-py">from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)</code></pre>
</br>

<p>4) 로지스틱 회귀</p>
<pre><code class="language-py">from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(solver=&#39;liblinear&#39;, random_state=13)
lr.fit(X_train, y_train)

y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

print(&#39;Train Acc : &#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc : &#39;, accuracy_score(y_test, y_pred_test))</code></pre>
<p>Train Acc :  0.7429286126611506
Test Acc :  0.7446153846153846</p>
</br>

<p>5) classification report</p>
<pre><code class="language-py">from sklearn.metrics import classification_report
print(classification_report(y_test, lr.predict(X_test)))</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/21a17770-0eb3-4f9a-8788-9859fe358a68/image.png" alt=""></li>
</ul>
</br>

<p>6) confusion matrix</p>
<pre><code class="language-py">from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lr.predict(X_test))</code></pre>
<p>array( [ [ 275, 202 ] , [ 130, 693 ] ] , dtype=int64 )</p>
<ul>
<li>0라인 | 0이라고 한 갯수, 1이라고 한 갯수 : [275,202]</li>
<li>1라인 | 0이라고 한 갯수, 1이라고 한 갯수 : [130,693]</li>
</ul>
</br>

<p>7) precision_recall curve</p>
<pre><code class="language-py">import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# predict_proba : class 별 확률을 구해주기 떄문에 1일 때 확률을 가져옴
pred = lr.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, pred)

plt.figure(figsize=(6, 3))
# thresholds를 기준으로, precisions그래프를 그림
# :len(thresholds) : thresholds의 크기 만큼 그리겠다
plt.plot(thresholds, precisions[:len(thresholds)], label=&#39;precision&#39;)
plt.plot(thresholds, recalls[:len(thresholds)], label=&#39;recall&#39;)
plt.grid()
plt.legend()
plt.show()</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/ba8d946e-f3f6-4cb8-9463-7fe1ecdadab7/image.png" alt=""></li>
</ul>
</br>

<p>8) threshold = 0.5</p>
<ul>
<li>threshold 값을 따로 정해주지 않으면 0.5가 디폴트값 임<pre><code class="language-py"># lr(분류기)에서 predict_proba를 X_test에 대해서 해라
pred_proba = lr.predict_proba(X_test)
# 앞부분 3개만 보고싶음
pred_proba[:3]</code></pre>
array
([[0.40472417, 0.59527583],<pre><code> [0.51002386, 0.48997614],
 [0.10222708, 0.89777292]])</code></pre></li>
</ul>
</br>

<p>9) 간단히 확인해보기</p>
<ul>
<li>1_ y_pred_test을 pred_proba 옆으로 붙인 데이터를 만들고 싶음</li>
<li>2_그래서 reshape을 통해 y_pred_test 먼저 틀을 만들어 줌 (리스트 안 리스트)<ul>
<li>.reshape(-1,1) : reshape(니가 알아서해, 마지막만 1로 만들어줘)</li>
</ul>
</li>
<li>3_ np.concatenate을 이용해 둘을 붙여 줌<pre><code class="language-py">import numpy as np
</code></pre>
</li>
</ul>
<p>np.concatenate([pred_proba, y_pred_test.reshape(-1,1)], axis=1)</p>
<pre><code>
&lt;/br&gt;

10) Binarizer
threshold를 사용자의 지정을 받아서 0과 1을 바꿔주는 명령어

```py
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.6).fit(pred_proba)
pred_bin = binarizer.transform(pred_proba)[:,1]
pred_bin</code></pre><p>array([0., 0., 1., ..., 1., 0., 1.])</p>
</br>

<p>11) 다시 classification report</p>
<pre><code>from sklearn.metrics import classification_report
print(classification_report(y_test, pred_bin))</code></pre></br>

<p>11) 다시 classification report</p>
<pre><code>from sklearn.metrics import classification_report
print(classification_report(y_test, pred_bin))</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/04dc58c2-c63a-4919-afec-b90b159ce42f/image.png" alt=""></li>
</ul>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 8. (분류)Logistic Regression - PIMA 인디언 당뇨병 예측]]></title>
            <link>https://velog.io/@jaam_mini/ML-8.-Logistic-Regression-PIMA-%EC%9D%B8%EB%94%94%EC%96%B8-%EB%8B%B9%EB%87%A8%EB%B3%91-%EC%98%88%EC%B8%A1</link>
            <guid>https://velog.io/@jaam_mini/ML-8.-Logistic-Regression-PIMA-%EC%9D%B8%EB%94%94%EC%96%B8-%EB%8B%B9%EB%87%A8%EB%B3%91-%EC%98%88%EC%B8%A1</guid>
            <pubDate>Tue, 23 Jan 2024 06:07:11 GMT</pubDate>
            <description><![CDATA[<h2 id="logistic-regression을-쓰는-이유--💡분류기-역할">Logistic Regression을 쓰는 이유 : 💡분류기 역할</h2>
<p>즉, linear regression (선형회귀)을 분류에 적용한 것이 Logistic Regression (로지스틱 회귀)이다.</p>
<p></br></br></p>
<h1 id="lr-이론">LR 이론</h1>
<hr>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/aa4f7694-978c-43ab-af82-6e3b9c89cee4/image.png" alt=""></li>
</ul>
<p>악성 종양을 찾는다고 가정하자.
linear regression (선형회귀)에 적용한다면 0과 1밖에 없어서 수많은 데이터를 분류하기가 어려움.
보이지 않는 데이터가 멀-리 있다면 확인이 어려움</p>
<p>출력이 0과 1사이에 위치하게 하는 [시그모이드]에 linear regression() 함수를 넣으면 = &quot;직선&quot;이 됨</p>
<h3 id="📌-sigmoid-function">📌 sigmoid (function)</h3>
<p>기울어진 S자 형태의 곡선 </br></p>
<h4 id="linear-regression에서-sigmoid를-재정의">linear regression에서 sigmoid를 재정의</h4>
<pre><code>  import numpy as np
  import matplotlib.pyplot as plt

  # np의 arrange 명령으로 (-10 ~ 10 까지, 0.01 간격)
  z = np.arange(-10,10,0.01)
  g = 1/(1+np.exp(-z))

  plt.plot(z,g);</code></pre><ul>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/121dee22-cc8d-4c83-94ae-23e1e9c3e99e/image.png" alt=""></p>
</li>
<li><p>그래프 멋내기</p>
<pre><code>import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(10,8))
ax = plt.gca()

ax.plot(z,g)
ax.spines[&#39;left&#39;].set_position(&#39;zero&#39;)
ax.spines[&#39;bottom&#39;].set_position(&#39;center&#39;)
ax.spines[&#39;right&#39;].set_color(&#39;none&#39;)
ax.spines[&#39;top&#39;].set_color(&#39;none&#39;)</code></pre></li>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/e9268cea-5cb0-45c9-8339-630d99692f8b/image.png" alt=""></p>
</li>
</ul>
<h3 id="📌-cost-function-funtion">📌 Cost Function (function)</h3>
<h4 id="logistic-regression에서-cost-function을-재정의">Logistic Regression에서 Cost Function을 재정의</h4>
<ul>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/ce4343a4-4171-4d73-9d5c-ed399dfe780c/image.png" alt=""></p>
<pre><code class="language-h">
c0 = -np.log(1-h)
c1 = -np.log(h)

plt.figure(figsize=(7,3))
plt.plot(h, c0, label=&#39;y=0&#39;)
plt.plot(h, c1, label=&#39;y=1&#39;)
plt.legend()

plt.show()
</code></pre>
</li>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/c1034094-626b-4d17-80d6-a080d0044b57/image.png" alt=""></p>
</li>
</ul>
<p></br></br></br></br></br></p>
<h1 id="와인-분석">와인 분석</h1>
<hr>
<p>1) 데이터 가져오기</p>
<pre><code class="language-py">import pandas as pd

wine_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv&#39;
wine = pd.read_csv(wine_url,index_col=0)
wine.head()</code></pre>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8c2c0dad-e886-4d93-a52f-ddceb936bcc4/image.png" alt=""></li>
</ul>
<p>2) 맛 등급 설정</p>
<pre><code class="language-py"># (1) quality 컬럼 이진화
# wine 데이터의 [&#39;taste&#39;] 컬럼 생성
# wine의 quality column울 grade로 잡고, 5등급 보다 크면 1, 그게 아니라면 0으로 잡음
wine[&#39;taste&#39;] = [1. if grade&gt;5 else 0. for grade in wine[&#39;quality&#39;]]



# (2) 모델링
# label인 taste, quality를 drop, 나머지를 X의 특성으로 봄
X = wine.drop([&#39;taste&#39;, &#39;quality&#39;], axis=1)

# 새로만들 y데이터
y = wine[&#39;taste&#39;]</code></pre>
<p>3) 데이터 분리</p>
<pre><code class="language-py">from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
</code></pre>
<p>4) 로지스틱 회귀</p>
<pre><code class="language-py"># 분류기
from sklearn.linear_model import LogisticRegression

# 성능
from sklearn.metrics import accuracy_score

# solver(최적화 알고리즘) = liblinear(데이터 수가 작으면 보통 이걸로 선택)
lr = LogisticRegression(solver=&#39;liblinear&#39;, random_state=13)

# 학습 (train, train)
lr.fit(X_train, y_train)

# 예측 = 학습이 완료된 lr에게 시킴
y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

# 성능 확인
print(&#39;Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</code></pre>
<p>Train Acc : 0.7429286126611506
Test Acc : 0.7446153846153846</p>
<p>5) 파이프라인 구축 (스케일러 적용)</p>
<pre><code class="language-py"># Pipeline
from sklearn.pipeline import Pipeline
# StandardScaler
from sklearn.preprocessing import StandardScaler

# 평가 변수
estimators = [
    # 표준화(scaler)
    (&#39;scaler&#39;, StandardScaler()),
    # 분류기(clf)
    (&#39;clf&#39;, LogisticRegression(solver=&#39;liblinear&#39;, random_state=13))
]

pipe = Pipeline(estimators)</code></pre>
<p>6) 학습, 예측, 성능 확인</p>
<pre><code class="language-py"># 학습
pipe.fit(X_train, y_train)

# 예측 = 학습이 완료된 lr에게 시킴
y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

# 성능 확인
print(&#39;Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</code></pre>
<p>Train Acc : 0.7444679622859341
Test Acc : 0.7469230769230769</p>
<p>7) Decision Tree와 비교</p>
<pre><code class="language-py">from sklearn.tree import DecisionTreeClassifier

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

models = {&#39;LogisticRegression&#39; : pipe, &#39;DecisionTree&#39; : wine_tree}</code></pre>
<p>8) AUC 그래프로 비교 확인</p>
<ul>
<li>thresholds(임계값) 보다 크면 양성, 작으면 음성</li>
<li>모델은 분류에서 확률(0~1) 또는 음수~양수 사이의 실수를 예측값으로 출력</li>
<li>sklearn에서는 predict_proba을 제공</li>
<li>predict_proba : 0.5 이상이면 1로 예측</li>
</ul>
<pre><code class="language-py"># roc_curve
from sklearn.metrics import roc_curve

plt.figure(figsize=(10,8))
plt.plot([0,1], [0,1])

# model_name : LogisticRegression, DecisionTree
# model : pipe, wine_tree
for model_name, model in models.items():
    # 첫번째 커럼은 0일 확률, 두번쨰 컬럼은 1일 확률이라서 [:, 1]
    # predict_proba : 0.5 이상이면 1로 예측
    pred = model.predict_proba(X_test)[:, 1]
    # roc_curve의 thresholds (임계값)
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    plt.plot(fpr, tpr, label=model_name)

plt.grid()
plt.legend()
plt.show()</code></pre>
<p>LogisticRegression의 결과가 더 좋은 것으로 확인 됨</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/7ad71ae4-58c4-4931-a4c3-78fb2061a317/image.png" alt=""></li>
</ul>
<p></br></br></br></br></br></p>
<h1 id="pima-인디언-당뇨병-예측-분석">PIMA 인디언 당뇨병 예측 분석</h1>
<hr>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b40ef3d2-b43b-45fb-baec-ae4d577655e3/image.png" alt=""></li>
</ul>
<p>1) 데이터 가져오기</p>
<pre><code>import pandas as pd

PIMA_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/diabetes.csv&#39;
PIMA = pd.read_csv(PIMA_url)
PIMA.head()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/66ff99e0-3694-4062-b8b3-99c9bf78e00e/image.png" alt=""></li>
</ul>
<p>2) 데이터 확인</p>
<pre><code>PIMA.info()</code></pre><p>3) 데이터 전부 float 으로 변환 (astype)</p>
<pre><code>PIMA = PIMA.astype(&#39;float&#39;)
PIMA.info()</code></pre><p>4) 상관관계 확인</p>
<pre><code>import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6,4))

# PIMA.corr() : PIMA의 상관계수()
sns.heatmap(PIMA.corr(), cmap=&#39;YlGnBu&#39;)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/3eb89699-84b2-4ca4-91a5-95caa4f05c67/image.png" alt=""></li>
</ul>
<p>5) 0인 데이터 확인 - 이상한 값들이 있는지 보기 위해</p>
<pre><code># (PIMA==0) : 0이 있는지 확인, T/F로 뜸
# (PIMA==0).astype(int) : T=1, F=0
# (PIMA==0).astype(int).sum() : 컬럼별로 0이 몇개 있는지 나옴
(PIMA== 0).astype(int).sum()</code></pre><h3 id="⭐-이상한-값결측치-해결">⭐ 이상한 값(결측치) 해결</h3>
<p>6) 이상한 값들은 평균값으로 대체 (replace)</p>
<pre><code># - 혈압(BloodPressure)은 0일수 없다...

zero_features = [&#39;Glucose&#39;, &#39;BloodPressure&#39;, &#39;SkinThickness&#39;, &#39;BMI&#39;]
PIMA[zero_features] = PIMA[zero_features].replace(0, PIMA[zero_features].mean())</code></pre><p>7) 데이터 분리</p>
<pre><code>X = PIMA.drop([&#39;Outcome&#39;], axis=1)
y = PIMA[&#39;Outcome&#39;]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=13, stratify=y)

estimators = [(&#39;scaler&#39;, StandardScaler()),
               (&#39;clf&#39;, LogisticRegression(solver=&#39;liblinear&#39;, random_state=13))]
pipe_lr = Pipeline(estimators)
pipe_lr.fit(X_train, y_train)
pred = pipe_lr.predict(X_test)</code></pre><p>8) 수치 확인</p>
<pre><code>from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             roc_auc_score, f1_score)

print(accuracy_score(y_test, pred))
print(recall_score(y_test, pred))
print(precision_score(y_test, pred))
print(roc_auc_score(y_test, pred))
print(f1_score(y_test, pred))</code></pre><p>9) 다변수 방정식의 각 계수 값 확인</p>
<pre><code>coeff = list(pipe_lr[&#39;clf&#39;].coef_[0])
labels = list(X_train.columns)</code></pre><p>10) feature 그리기</p>
<pre><code># DataFrame
features = pd.DataFrame({&#39;Features&#39;: labels, &#39;importance&#39;: coeff})
features.sort_values(by=[&#39;importance&#39;], ascending=True, inplace=True)

# positive 생성
features[&#39;positive&#39;] = features[&#39;importance&#39;] &gt; 0
features.set_index(&#39;Features&#39;, inplace=True)

# importance 를 그릴 것
features[&#39;importance&#39;].plot(kind=&#39;barh&#39;, figsize=(11, 6),
                            color=features[&#39;positive&#39;].map({True: &#39;blue&#39;, False: &#39;red&#39;}))
plt.xlabel(&#39;Importance&#39;)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a4ee8c5e-12d9-4e3d-888a-42a60338b2d1/image.png" alt=""></li>
</ul>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 7. Cost Function & Gradient Descent _ 보스턴 집값 예측(분석)]]></title>
            <link>https://velog.io/@jaam_mini/ML-7.-Cost-Function-Gradient-Descent</link>
            <guid>https://velog.io/@jaam_mini/ML-7.-Cost-Function-Gradient-Descent</guid>
            <pubDate>Mon, 22 Jan 2024 09:00:50 GMT</pubDate>
            <description><![CDATA[<h1 id="1-cost-function">1. Cost Function</h1>
<p><code>📌 에러를 표현하는 도구</code></p>
<ul>
<li>최소값 지점 찾기
<img src="https://velog.velcdn.com/images/jaam_mini/post/ddf32e76-42aa-41d2-9f46-fe525a205329/image.png" alt=""></li>
</ul>
<pre><code>import sympy as sym

# Symbol : 기호로 인식됨
theta = sym.Symbol(&#39;theta&#39;)

# diff : 미분하세요
diff_th = sym.diff(38*theta**2 - 94*theta + 62, theta)
diff_th</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/37d5db26-c935-42ed-9e70-30b1cf363444/image.png" alt=""><pre><code>94/76</code></pre></li>
<li>1.236842105263158 지점</li>
</ul>
<ul>
<li><p>데이터 = 모델</p>
<ul>
<li>에러는 &#39;0&#39;</li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/0bf83385-8949-425f-b392-d24877fc6439/image.png" alt=""></li>
</ul>
</li>
<li><p>데이터 != 모델</p>
<ul>
<li>에러가 &#39;증가&#39;</li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/e93461cd-9072-42f3-9d7b-473a1d07384a/image.png" alt=""></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/804156b8-9411-4599-8c14-1dcd18fdb3f6/image.png" alt=""></li>
</ul>
</li>
</ul>
<p></br></br></br></p>
<h1 id="2-gradient-descent">2. Gradient Descent</h1>
<hr>
<ul>
<li>개념 설명
<img src="https://velog.velcdn.com/images/jaam_mini/post/67dcefb6-eb63-454b-8ce9-386d980ae355/image.png" alt=""></li>
</ul>
<ul>
<li>즉 !
Gradient Descent 는 미분을 해서 어디로(오/왼) 가야할지 정하는 것</li>
</ul>
<p></br></br></br></p>
<h1 id="3-다변수-데이터에-대한-회귀">3. 다변수 데이터에 대한 회귀</h1>
<ul>
<li>feature : 여러개의 특성
<img src="https://velog.velcdn.com/images/jaam_mini/post/f6f66dbb-f41c-43b4-aead-207a4c78a0b1/image.png" alt=""></li>
</ul>
<p></br></br></br></p>
<h1 id="보스턴-집값-예측">보스턴 집값 예측</h1>
<p>the boston house-price data of Harrison, D. and Rubinfeld, D.L. &#39;Hedonic prices and the demand for clean air&#39;</p>
<hr>
<p>강의 시 제공해준 seaborn이 제대로 실행되지 않아, csv 파일로 진행</p>
</br>

<ol>
<li>데이터 읽기<pre><code>import pandas as pd
</code></pre></li>
</ol>
<p>boston_url = &#39;<a href="https://raw.githubusercontent.com/blackdew/tensorflow1/master/csv/boston.csv">https://raw.githubusercontent.com/blackdew/tensorflow1/master/csv/boston.csv</a>&#39;
boston = pd.read_csv(boston_url)</p>
<pre><code>
2. key값 확인</code></pre><p>boston.keys</p>
<pre><code>
3. 컬럼 확인</code></pre><p>boston.columns</p>
<pre><code>Index([&#39;crim&#39;, &#39;zn&#39;, &#39;indus&#39;, &#39;chas&#39;, &#39;nox&#39;, &#39;rm&#39;, &#39;age&#39;, &#39;dis&#39;, &#39;rad&#39;, &#39;tax&#39;,
       &#39;ptratio&#39;, &#39;b&#39;, &#39;lstat&#39;, &#39;medv&#39;],
      dtype=&#39;object&#39;)

4. 컬럼 예쁘게 확인</code></pre><p>[each for each in boston.columns]</p>
<pre><code>[&#39;crim&#39;,
 &#39;zn&#39;,
 &#39;indus&#39;,
 &#39;chas&#39;,
 &#39;nox&#39;,
 &#39;rm&#39;,
 &#39;age&#39;,
 &#39;dis&#39;,
 &#39;rad&#39;,
 &#39;tax&#39;,
 &#39;ptratio&#39;,
 &#39;b&#39;,
 &#39;lstat&#39;,
 &#39;medv&#39;]

5. 전체 데이터 확인</code></pre><p>boston</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/42bc1745-4b92-4599-8d4c-2565e2290a20/image.png)

6. 데이터 파악을 위해 pandas로 정리
- csv 파일을 불러와서 이미 df 임, 생략

7. [&#39;medv&#39;] -&gt; [&#39;price&#39;]
강의 자료와 동일시 하기 위해 컬럼명 변경

- [다양한 컬럼/인덱스 변경 방법](https://blog.naver.com/rising_n_falling/222061033231)</code></pre><h1 id="데이터-파악을-위해-pandas로-정리-csv로-이미-되어-있어서-생략">데이터 파악을 위해 pandas로 정리 (csv로 이미 되어 있어서 생략)</h1>
<h1 id="컬럼명-변경">컬럼명 변경</h1>
<p>boston.columns.values[13] = &#39;price&#39;
boston.head(2)</p>
<pre><code>
8. 상관계수 확인</code></pre><p>#모듈
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline</p>
<h1 id="상관계수-변수--df상관계수-함수소수점-첫쨰자리">상관계수 변수 = df.상관계수 함수().소수점 첫째자리</h1>
<p>corr_mat = boston.corr().round(1)</p>
<h1 id="사이즈-설정">사이즈 설정</h1>
<p>sns.set(rc={&#39;figure.figsize&#39;:(18,8)})</p>
<h1 id="히트맵-상관계수-변수-숫자를-기록해주세요-컬러">히트맵 (상관계수 변수, 숫자를 기록해주세요, 컬러)</h1>
<p>sns.heatmap(data=corr_mat, annot=True, cmap=&#39;bwr&#39;)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/9a5b7d70-bcdd-4c88-972c-e120f1568281/image.png)


9. RM과 LSTAT와 PRICE의 관계 보기</code></pre><p>sns.set_style(&#39;darkgrid&#39;)
sns.set(rc={&#39;figure.figsize&#39;:(18,8)})</p>
<h1 id="컬럼의-개수는-2개그래프-2개-그릴-것임">컬럼의 개수는 2개(그래프 2개 그릴 것임)</h1>
<p>fig, ax = plt.subplots(ncols=2)</p>
<h1 id="위에서-medv----price-컬럼명을-바꿨지만-그래프가-생성되지-않아-그대로-medv-사용">위에서 medv -&gt;  price 컬럼명을 바꿨지만, 그래프가 생성되지 않아, 그대로 [medv] 사용</h1>
<h1 id="regplot방수-가격-왼쪽">regplot(방수, 가격, 왼쪽)</h1>
<p>sns.regplot(x=&#39;rm&#39;, y=&#39;medv&#39;, data=boston, ax=ax[0])</p>
<h1 id="regplot하위계층-가격-왼쪽">regplot(하위계층, 가격, 왼쪽)</h1>
<p>sns.regplot(x=&#39;lstat&#39;, y=&#39;medv&#39;, data=boston, ax=ax[1])</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/949ad439-2ae7-4b1a-b461-a632843fb79d/image.png)

10. 데이터 나누기
</code></pre><p>from sklearn.model_selection import train_test_split</p>
<p>X = boston.drop(&#39;medv&#39;, axis=1)
y = boston[&#39;medv&#39;]</p>
<p>X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)</p>
<pre><code>
11. LinearRegression (학습)</code></pre><p>from sklearn.linear_model import LinearRegression</p>
<p>reg = LinearRegression()
reg.fit(X_train, y_train)</p>
<pre><code>
12. RMS (평가)</code></pre><p>import numpy as np
from sklearn.metrics import mean_squared_error</p>
<h1 id="predict">predict</h1>
<p>pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)</p>
<h1 id="mean_squared_error-선형-회귀-에서-주로-사용">mean_squared_error (선형 회귀 에서 주로 사용)</h1>
<h1 id="루트npsqrtmean_squared_error참값-pred_tr-예측값">루트(np.sqrt)(mean_squared_error(참값, pred_tr-예측값))</h1>
<p>rmse_tr = (np.sqrt(mean_squared_error(y_train, pred_tr)))
rmse_test = (np.sqrt(mean_squared_error(y_test, pred_test)))</p>
<p>print(&#39;RMSE of Train Data : &#39;, rmse_tr)
print(&#39;RMSE of Test Data : &#39;, rmse_test)</p>
<pre><code>RMSE of Train Data :  4.642806069019824
RMSE of Test Data :  4.9313525841467145


13. 성능 확인 (그래프로 확인)</code></pre><p>import matplotlib.pyplot as plt</p>
<h1 id="pltscatterx참값-y예측값">plt.scatter(x=참값, y=예측값)</h1>
<p>plt.scatter(y_test, pred_test)
plt.plot([0,48], [0,48], &#39;r&#39;)</p>
<p>plt.xlabel(&quot;Actual House Price ($1000)&quot;)
plt.ylabel(&quot;predicted Prices&quot;)
plt.title(&quot;Real vs Predicted&quot;)</p>
<p>plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/d8ca7acc-9546-466a-8ce8-dd5df072d2e5/image.png)









</code></pre>]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 6. Basic of Regression _ 회귀 기초]]></title>
            <link>https://velog.io/@jaam_mini/ML-6.-Basic-of-Regression-%ED%9A%8C%EA%B7%80-%EA%B8%B0%EC%B4%88</link>
            <guid>https://velog.io/@jaam_mini/ML-6.-Basic-of-Regression-%ED%9A%8C%EA%B7%80-%EA%B8%B0%EC%B4%88</guid>
            <pubDate>Mon, 22 Jan 2024 06:40:02 GMT</pubDate>
            <description><![CDATA[<p>지난 시간까지 배운 것 
▶ 지도학습 (라벨을 달아주는 것 = 정답을 알려주는것)</p>
<ol>
<li>데이터를 기반으로 하는 문제 해결 방법</li>
</ol>
<ul>
<li>문제 분석 &gt; 학습 시킴(데이터 계속 유입됨) &gt; 데이터를 베이스로 하기 때문에, 알고리즘 구현 &amp; 서비스 런칭 부분만 코딩으로 해결
<img src="https://velog.velcdn.com/images/jaam_mini/post/dc5cd382-5044-4159-9ba8-81d80b84afec/image.png" alt=""></li>
</ul>
<ol start="2">
<li>모델 스스로 
데이터 수집 &gt; 트레이닝 &gt; 업데이트(평가, 런칭) &gt; 데이터 유입
<img src="https://velog.velcdn.com/images/jaam_mini/post/2456ed84-1cab-4cc8-9b1d-4221bd806a4a/image.png" alt=""></li>
</ol>
<hr>
<h3 id="--회귀-모델">- 회귀 모델</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/e7746187-273e-4e60-b607-6263dbf3d905/image.png" alt=""></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/cebf23da-ef2e-4708-b8a9-5ca62b49156a/image.png" alt=""></li>
</ul>
<h3 id="--1차-함수">- 1차 함수</h3>
<ul>
<li>기울기 &amp; y절편이 있음
<img src="https://velog.velcdn.com/images/jaam_mini/post/a1f878bf-ea68-444a-9551-0a536ef984ec/image.png" alt=""></li>
</ul>
<h3 id="--선형-회귀">- 선형 회귀</h3>
<ul>
<li>내가 가지고 있는 데이터와 가장 잘 맞는 직선을 찾겠다</li>
<li>그리고 그 직선을 hypothesis 라고 한다
<img src="https://velog.velcdn.com/images/jaam_mini/post/7ca0f4b7-e8e3-41be-843c-1cd238f26808/image.png" alt=""></li>
</ul>
<hr>
<h1 id="1-ols--ordinary-linear-least-square">1. OLS : Ordinary Linear Least Square</h1>
<h3 id="1_기본-예제">1_기본 예제</h3>
<pre><code># !pip install statsmodels</code></pre><pre><code># 1) 데이터 설정
import pandas as pd

data = {&#39;x&#39;:[1.,2.,3.,4.,5.], &#39;y&#39;:[1.,3.,4.,6.,5.]}
df = pd.DataFrame(data)
df</code></pre><pre><code># 2) 가설 세우기

import statsmodels.formula.api as smf

# formula=&quot;y~x&quot; : y=ax+b  라는 의미를 내포
lm_model = smf.ols(formula=&quot;y~x&quot;, data=df).fit()</code></pre><pre><code># 3) 결과

lm_model.params</code></pre><pre><code># 4) seaborn

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,7))
sns.lmplot(x=&#39;x&#39;, y=&#39;y&#39;, data=df);

# xlim : plt 축 범위 설정 함수, x축 범위 지정
plt.xlim([0,5])</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/40079da0-9692-4ba7-9517-9696b2353c4c/image.png" alt=""></li>
</ul>
<h3 id="2_잔차-resid-평가">2_잔차 (resid) 평가</h3>
<ul>
<li>잔차 란?<ul>
<li>내 모델과 실제 값의 차이</li>
<li>잔차 평가는 잔차의 평균이 0(=이라서 회귀하는 것임)이고 정규분포를 따라야 함</li>
</ul>
</li>
</ul>
<pre><code># 잔차 확인
resid = lm_model.resid
resid</code></pre><p>▼ 에러값s</p>
<p>0   -0.6
1    0.3
2    0.2
3    1.1
4   -1.0
dtype: float64</p>
<h3 id="3_r-squared-결정계수">3_R-Squared (결정계수)</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a6775664-71ff-46c5-be5c-54470152dca2/image.png" alt=""><ul>
<li>녹색 : 평균</li>
<li>분모 : 참값(점)이 가지는 평균으로 부터의 오차(점~녹색 거리)</li>
<li>분자 : 예측값(노랑~녹색 거리)으로 부터 가지는 평균으로의 오차</li>
</ul>
</li>
</ul>
<ul>
<li>참값 = 예측값 : 1 
참값이 예측값과 일치한다면 1임</li>
</ul>
<p></br></br></p>
<ul>
<li>(기본) 결정계수 구하기 <pre><code>import numpy as np
</code></pre></li>
</ul>
<h1 id="df의-y컬럼-평균을-mu로-잡음">df의 y컬럼 평균을 mu로 잡음</h1>
<p>mu = np.mean(df[&#39;y&#39;])
y = df[&#39;y&#39;]</p>
<h1 id="예측값y_hat">예측값(y_hat)</h1>
<p>y_hat = lm_model.predict()</p>
<h1 id="합계">합계</h1>
<h1 id="분자예측값-평균2--분모참값-평균2">분자(예측값-평균)^2 / 분모(참값-평균)^2</h1>
<p>np.sum((y_hat - mu)<strong>2 / np.sum((y - mu)</strong>2))</p>
<pre><code>0.8175675675675673

- (쉽게) 결정계수 구하기 
</code></pre><p>lm_model.rsquared</p>
<pre><code>0.8175675675675677


- 분포도 확인</code></pre><h1 id="잔차의-분포도-확인">잔차의 분포도 확인</h1>
<p>sns.distplot(resid, color=&#39;black&#39;);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/422a552b-1851-482f-b210-50afeb43014d/image.png)

## 2. 통계적 회귀
---</code></pre><h1 id="모듈">모듈</h1>
<p>import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns</p>
<pre><code></code></pre><h1 id="1-데이터-로드">1) 데이터 로드</h1>
<p>data_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/ecommerce.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/ecommerce.csv&#39;</a>
data = pd.read_csv(data_url)</p>
<pre><code></code></pre><h1 id="2-구조-확인">2) 구조 확인</h1>
<p>data.tail()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/a449b9f8-a6ad-431b-bdcf-ef3b9c01b822/image.png)
</code></pre><h1 id="3-컬럼-확인">3) 컬럼 확인</h1>
<p>data.columns</p>
<pre><code>Index([&#39;Email&#39;, &#39;Address&#39;, &#39;Avatar&#39;, &#39;Avg. Session Length&#39;, &#39;Time on App&#39;,
       &#39;Time on Website&#39;, &#39;Length of Membership&#39;, &#39;Yearly Amount Spent&#39;],
      dtype=&#39;object&#39;)
</code></pre><h1 id="4-필요-없는-컬럼-삭제">4) 필요 없는 컬럼 삭제</h1>
<p>data.drop([&#39;Email&#39;, &#39;Address&#39;, &#39;Avatar&#39;], axis=1, inplace=True)
data.info()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/845d985e-6ccc-40d2-90fa-0370d7a722cb/image.png)
</code></pre><h1 id="5-컬럼별-boxplot">5) 컬럼별 boxplot</h1>
<p>plt.figure(figsize=(12,6))
sns.boxplot(data=data);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/8d05e637-396f-4f6c-b0da-20b4946c8f89/image.png)
</code></pre><h1 id="6-특정-칼럼-다시-boxplot">6) 특정 칼럼 다시 boxplot</h1>
<p>plt.figure(figsize=(12,6))
sns.boxplot(data=data.iloc[:, :-1]);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/d10a4037-d229-4976-a01d-cd44256fc50a/image.png)
  - ```
    data.iloc[:]
    ```
  - ![](https://velog.velcdn.com/images/jaam_mini/post/b1a1112d-2f54-4a2d-8cbb-5ba67c92d721/image.png)

  - ```
    data.iloc[:, :-1]
    ```
  - ![](https://velog.velcdn.com/images/jaam_mini/post/200f9d70-6f9a-4718-a98b-47d9b54d6017/image.png)
</code></pre><h1 id="7-label-값에-대한-boxplot">7) label 값에 대한 boxplot</h1>
<p>plt.figure(figsize=(12,6))
sns.boxplot(data=data[&#39;Yearly Amount Spent&#39;]);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/c5129e03-30fa-48d1-bb9e-6d772e5ce1c9/image.png)
</code></pre><h1 id="8-pairplot으로-경향-확인">8) pairplot으로 경향 확인</h1>
<p>plt.figure(figsize=(12,6))
sns.pairplot(data=data);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/cf2cec6e-a9b7-41fa-b5f4-b4ba57fbd0c2/image.png)</code></pre><h1 id="10-상관관계를-갖는-것을-lmplot으로-확인">10) 상관관계를 갖는 것을 lmplot으로 확인</h1>
<p>plt.figure(figsize=(12,6))
sns.lmplot(x=&#39;Length of Membership&#39;, y=&#39;Yearly Amount Spent&#39;, data=data);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/d773b894-e1b8-4b61-bcd7-508269a68a5e/image.png)
</code></pre><h1 id="11-상관이-높은-멤버십-유지기간-만-가지고-통계적-회귀">11) 상관이 높은 멤버십 유지기간 만 가지고 통계적 회귀</h1>
<p>import statsmodels.api as sm
X = data[&#39;Length of Membership&#39;]
y = data[&#39;Yearly Amount Spent&#39;]
lm = sm.OLS(y, X).fit()
lm.summary()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/e1598fe4-9633-46e1-a8cd-0967767ee7f0/image.png)
</code></pre><h1 id="12-회귀-모델-그리기">12) 회귀 모델 그리기</h1>
<p>pred = lm.predict(X)</p>
<p>sns.scatterplot(x=X, y=y)
plt.plot(X, pred, &#39;r&#39;, ls=&#39;dashed&#39;, lw=3)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/da9460b7-8b8c-4cb9-888d-1e1483ed92e5/image.png)</code></pre><h1 id="13-참-값-예측값-그리기">13) 참 값, 예측값 그리기</h1>
<p>sns.scatterplot(x=y, y=pred)
plt.plot([min(y), max(y)], [min(y), max(y)], &#39;r&#39;, ls=&#39;dashed&#39;, lw=3);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/9b348fcf-66ef-4969-a378-d79db9cd2535/image.png)
</code></pre><h1 id="14-참-값-예측값-그리기">14) 참 값, 예측값 그리기</h1>
<p>sns.scatterplot(x=y, y=pred)
plt.plot([min(y), max(y)], [min(y), max(y)], &#39;r&#39;, ls=&#39;dashed&#39;, lw=3);3
plt.plot([0,max(y)], [0, max(y)], &#39;b&#39;, ls=&#39;dashed&#39;, lw=3);
plt.axis([0,max(y), 0, max(y)])</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/eb0bebcb-4d32-407f-8360-e462869fdc7b/image.png)
</code></pre><h1 id="15-상수항-추가-열추가">15) 상수항 추가 (열추가)</h1>
<h1 id="c_를-해주면-바아로-삽입됨">c_를 해주면 바로 삽입됨</h1>
<h1 id="추가-원래-x에-1을-x의-길이-만큼-만들어서">추가 [원래 X에, 1을 X의 길이 만큼 만들어서]</h1>
<p>X = np.c_[X, [1]*len(X)]</p>
<h1 id="잘-추가-됐는지-5개만-보기">잘 추가 됐는지 5개만 보기</h1>
<p>X[:5]</p>
<pre><code>array([[4.08262063, 1.        ],
       [2.66403418, 1.        ],
       [4.1045432 , 1.        ],
       [3.12017878, 1.        ],
       [4.44630832, 1.        ]])

</code></pre><h1 id="16-다시-fit">16) 다시 fit()</h1>
<p>lm = sm.OLS(y, X).fit()
lm.summary()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/68e2e591-f938-49cb-9863-111f8f7cf8a3/image.png)

  - 아까와는 다르게 x1의 밸류와 constant가 잡힘
  - R squared가 작아짐
    - R squared : 평균을 기준으로 예측값과 실제값 간의 편차가 얼마나 되는지를 나타내는 지표
  - AIC 가 작아짐 (낮을 수 록 좋음)
    - AIC : 내가 만들어낸 모델이 나의 데이터를 얼마나 잘 반영하는지 측정하는 도구 (=원래 정보를 얼마나 손실 시키는지의 정도)

</code></pre><h1 id="17-다시-선형-회귀">17) 다시 선형 회귀</h1>
<p>pred = lm.predict(X)</p>
<p>sns.scatterplot(x=X[:, 0], y=y)
plt.plot(X[:, 0], pred, &#39;r&#39;, ls=&#39;dashed&#39;, lw=3)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/7dae7fae-b154-4894-a384-967912f69fc3/image.png)</code></pre><h1 id="18-참-값-예측값-그리기">18) 참 값, 예측값 그리기</h1>
<h1 id="x참값-y예측값">(x=참값, y=예측값)</h1>
<p>sns.scatterplot(x=y, y=pred)
plt.plot([min(y), max(y)], [min(y), max(y)], &#39;r&#39;, ls=&#39;dashed&#39;, lw=3);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/94603d0a-e45b-4df9-8956-d2b9f401e5ea/image.png)
</code></pre><h1 id="19-데이터-분리-후">19) 데이터 분리 후</h1>
<p>from sklearn.model_selection import train_test_split</p>
<p>X = data.drop(&#39;Yearly Amount Spent&#39;, axis=1)
y = data[&#39;Yearly Amount Spent&#39;]</p>
<p>X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13
)</p>
<pre><code></code></pre><h1 id="20-4개-컬럼-모두-변수로-회귀">20) 4개 컬럼 모두 변수로 회귀</h1>
<p>import statsmodels.api as sm</p>
<p>lm = sm.OLS(y_train, X_train).fit()
lm.summary()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/4fdd5cb7-5e94-45cb-9f1a-c78263fb8f96/image.png)

- 이전 값보다
  - R squared가 높아짐
  - AIC 가 작아짐 (낮을 수 록 좋음)
</code></pre><h1 id="21-참값-vs-예측값">21) 참값 vs 예측값</h1>
<p>pred = lm.predict(X_test)</p>
<p>sns.scatterplot(x=y_test, y=pred)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], &#39;r&#39;, ls=&#39;dashed&#39;, lw=3);</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/1bba58d0-383e-4971-b120-56af25766354/image.png)</code></pre>]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 5. Model Evaluation _ 함수 & box plot ]]></title>
            <link>https://velog.io/@jaam_mini/ML-5.-Model-Evaluation-%ED%95%A8%EC%88%98-box-plot</link>
            <guid>https://velog.io/@jaam_mini/ML-5.-Model-Evaluation-%ED%95%A8%EC%88%98-box-plot</guid>
            <pubDate>Mon, 22 Jan 2024 04:43:54 GMT</pubDate>
            <description><![CDATA[<h1 id="기초-수학-개념">기초 수학 개념</h1>
<hr>
<ol>
<li>회귀 모델</li>
</ol>
<ul>
<li>내가 가지고 있는 데이터를 직선으로 만들어 두고, 각 값들을 예측하는 것</li>
<li>(회귀 모델 예측 결과) : 연속된 변수값</li>
</ul>
<ol start="2">
<li>분류 모델</li>
</ol>
<ul>
<li>구분이 명확함</li>
<li>몇개의 종류에서 값을 찾아내는 것 (iris, 와인 프로젝트)</li>
</ul>
<ol start="3">
<li>이진 분류</li>
</ol>
<ul>
<li>0 과 1</li>
<li>맞다, 아니다</li>
<li>전체 데이터 에서 실제 1의 값을 가진 데이터<ul>
<li>TP = 실제 1인데 1로 맞춘 것</li>
<li>FN = 실제 1인데 틀리게 예측한 값</li>
</ul>
</li>
<li>전체 데이터 에서 0의 값을 가진 데이터(아래)<ul>
<li>TN = 0을 0으로 맞춤</li>
<li>FP = 0을 1이라고 틀리게 예측한 것</li>
</ul>
</li>
</ul>
<ol start="4">
<li>Accuracy</li>
</ol>
<ul>
<li>전체 데이터 중 맞게 예측한 것의 비율</li>
</ul>
<ol start="5">
<li>Precision</li>
</ol>
<ul>
<li>내가 1이라고 말한(예측한) 것들 중에서 실제 1인 것의 비율</li>
</ul>
<ol start="6">
<li>recall (재현율)</li>
</ol>
<ul>
<li>실제 1인 데이터 중에서 1이라고 예측한 것</li>
</ul>
<ol start="7">
<li>fall out (FPR)</li>
</ol>
<ul>
<li>실제 0 중에서 1이라고 잘못 예측한 것</li>
</ul>
<ol start="8">
<li>F1-Score (조합평균)</li>
</ol>
<ul>
<li>recall + precision 결합한 지표</li>
<li>어느 한쪽으로 치우치지 않고 둘다 높은 값을 가질 수록 높은 값을 가짐</li>
</ul>
<ol start="9">
<li>ROC 곡선</li>
</ol>
<ul>
<li>FPR(= fall out)이 변할때 TPR(= Recall)의 변화를 그린 그림</li>
<li>FPR을 X축, TPR을 Y축</li>
<li>직선에 가까울 수록 머신러닝 모델의 성능이 떨어지는 것으로 판단</li>
</ul>
<ol start="10">
<li>AUC</li>
</ol>
<ul>
<li>ROC 커브의 밑에 면적</li>
</ul>
<h1 id="1-roc-커브-그리그">1. ROC 커브 그리기</h1>
<hr>
<pre><code># 1) 데이터 불러오기 &amp; concat

import pandas as pd

red_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;
white_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;
red_wine = pd.read_csv(red_url, sep=&#39;;&#39;)
white_wine = pd.read_csv(white_url, sep=&#39;;&#39;)

red_wine[&#39;color&#39;] = 1.
white_wine[&#39;color&#39;] = 0.

wine = pd.concat([red_wine, white_wine])


# 2) 맛 분류를 위한 데이터 정리
wine[&#39;taste&#39;] = [1. if grade &gt; 5 else 0 for grade in wine[&#39;quality&#39;]]

X = wine.drop([&#39;taste&#39;,&#39;quality&#39;], axis= 1)
y = wine[&#39;taste&#39;]



# 3) 의사 결정 나무 모델 확인
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print(&#39;Train Acc: &#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc: &#39;, accuracy_score(y_test, y_pred_test))</code></pre><p>Train Acc:  0.7294593034442948
Test Acc:  0.7161538461538461</p>
<pre><code># 4) 각 수치 구하기

from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score, roc_curve)

print(&#39;accuracy : &#39;, accuracy_score(y_test, y_pred_test))
print(&#39;recall :&#39;, recall_score(y_test, y_pred_test))
print(&#39;precision :&#39;, precision_score(y_test, y_pred_test))
print(&#39;AUC score : &#39;, roc_auc_score(y_test, y_pred_test))
print(&#39;F1-score&#39;, f1_score(y_test, y_pred_test))</code></pre><p>accuracy :  0.7161538461538461
recall : 0.7314702308626975
precision : 0.8026666666666666
AUC score :  0.7105988470875331
F1-score 0.7654164017800381</p>
<pre><code>wine_tree.predict_proba(X_test)</code></pre><p>array([[0.61602594, 0.38397406],
       [0.61602594, 0.38397406],
       [0.12197802, 0.87802198],
       ...,
       [0.12197802, 0.87802198],
       [0.61602594, 0.38397406],
       [0.12197802, 0.87802198]])</p>
<pre><code># 5) 그리기

# 모듈
import matplotlib.pyplot as plt
%matplotlib inline

# X_test에 대한 predict_proba를 먼저 찾음
# 위에서 학습한 wine_tree의 predict_proba 함수에 X_test에를 넣어 줌
# [:, 1] : 1인 확률들만 취득 (위 결과값의 배열 = [0,1])
pred_proba = wine_tree.predict_proba(X_test)[:, 1]

# pred_proba : 위 결과값
# pred_proba를 roc_curve에 넣고 확률값을 확인
fpr, tpr, thresholds = roc_curve(y_test, pred_proba)

plt.figure(figsize=(10,8))
plt.plot([0,1], [0,1], &#39;o&#39;, ls=&#39;dashed&#39;)

# x축, y축
plt.plot(fpr, tpr)

plt.grid()
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/884117ed-801d-46b4-91d2-9390adfaaf35/image.png" alt=""></li>
</ul>
<h1 id="2함수의-기초">2.함수의 기초</h1>
<hr>
<h3 id="1-다항함수">1) 다항함수</h3>
<pre><code># 1) 기본 

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.style.use(&#39;seaborn-whitegrid&#39;)

# -3 ~ 2 까지 100개 만들기
x = np.linspace(-3,2,100)
y = 3*x**2 + 2

plt.figure(figsize=(6,4))
plt.plot(x,y)

# 수학 기호 (이탤릭체)
plt.xlabel(&#39;$x$&#39;, fontsize=25)
plt.ylabel(&#39;$3x^2 +2$&#39;, fontsize=25)

plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a97deb2e-b7ab-4d31-a150-1e0970be4f8a/image.png" alt=""></li>
</ul>
<pre><code># 2) x축 방향 이동
x = np.linspace(-5,5,100)
y1 = 3*x**2 +2
y2 = 3*(x+1)**2 +2

plt.figure(figsize=(6,4))
plt.plot(x, y1, lw=2, ls=&#39;dashed&#39;, label=&#39;$y=3x^2 +2$&#39;)
plt.plot(x, y2, label=&#39;$y=3(x+1)^2 +2$&#39;)
plt.legend(fontsize=15)
plt.xlabel(&#39;$x$&#39;, fontsize=25)
plt.ylabel(&#39;$y$&#39;, fontsize=25)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/024eb7ab-1040-4bc7-a655-196b9fc5c840/image.png" alt=""></li>
</ul>
<h3 id="2-지수함수">2) 지수함수</h3>
<pre><code>x = np.linspace(-2,2,100)
a11, a12, a13 = 2,3,4
y11, y12, y13 = a11**x, a12**x, a13**x

a21, a22, a23 = 1/2, 1/3, 1/4
y21, y22, y23 = a21**x, a22**x, a23**x</code></pre><pre><code># 1) 그래프

# plt.subplots(1행, 2열, 사이즈)
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

ax[0].plot(x, y11, color=&#39;k&#39;, label=r&quot;$2^x$&quot;)
ax[0].plot(x, y12, &#39;--&#39;, color=&#39;k&#39;, label=r&quot;$3^x$&quot;)
ax[0].plot(x, y13, &#39;:&#39;, color=&#39;k&#39;, label=r&quot;$4^x$&quot;)
ax[0].legend(fontsize=20)

ax[1].plot(x, y21, color=&#39;k&#39;, label=r&quot;$(1/2)^x$&quot;)
ax[1].plot(x, y22, &#39;--&#39;, color=&#39;k&#39;, label=r&quot;$(1/3)^x$&quot;)
ax[1].plot(x, y23, &#39;:&#39;, color=&#39;k&#39;, label=r&quot;$(1/4)^x$&quot;)
ax[1].legend(fontsize=20)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/96a2b431-a2cc-4bd6-96c9-58d5f8929e06/image.png" alt=""></li>
</ul>
<h3 id="3특이한-지수">3)특이한 지수</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/c5a41258-d065-4b8d-ad84-e4133f888f98/image.png" alt=""></li>
</ul>
<pre><code># 어떤 함수 인지 대략 확인
import numpy as np

x = np.array([10,100,1000,10000,100000])
(1+1/x)**x</code></pre><p>array([2.59374246, 2.70481383, 2.71692393, 2.71814593, 2.71826824])</p>
<p>=&gt; x 값이 커질 수록 어떠한 값으로 수렴(----)하는 함수임을 확인</p>
<h3 id="4-로그함수">4) 로그함수</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/1f514fcb-a808-4e2b-b1ce-500d18ccb9d3/image.png" alt=""></li>
</ul>
<pre><code># 데이터 준비

# 로그함수를 만들고 싶다면
def log(x, base):
    # 밑수(base)를 return으로 지정
    return np.log(x)/np.log(base)

x1 = np.linspace(0.0001, 5, 1000)
x2 = np.linspace(0.01, 5, 100)

y11, y12 = log(x1, 10), log(x2, np.e)
y21, y22 = log(x1, 1/10), log(x2, 1/np.e)</code></pre><pre><code># 그리기 준비
fig, ax = plt.subplots(1,2, figsize=(12, 6))

ax[0].plot(x1, y11, color=&#39;k&#39;, label=r&#39;$\log_{10} x$&#39;)
ax[0].plot(x2, y12, &#39;--&#39;, color=&#39;k&#39;, label=r&#39;$\log_{e} x$&#39;)

ax[0].set_xlabel(&#39;$x$&#39;, fontsize=25)
ax[0].set_ylabel(&#39;$y$&#39;, fontsize=25)
ax[0].legend(fontsize=20, loc=&#39;lower right&#39;)

ax[1].plot(x1, y21, color=&#39;k&#39;, label=r&#39;$\log_{1/10} x$&#39;)
ax[1].plot(x2, y22, &#39;--&#39;, color=&#39;k&#39;, label=r&#39;$\log_{1/e} x$&#39;)

ax[1].set_xlabel(&#39;$x$&#39;, fontsize=25)
ax[1].set_ylabel(&#39;$y$&#39;, fontsize=25)
ax[1].legend(fontsize=20, loc=&#39;upper right&#39;)

plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/43d9ed73-e305-4e64-85a7-3cab7db7d5e5/image.png" alt=""></li>
</ul>
<h3 id="5-시그모이드">5) 시그모이드</h3>
<p>0 과 1사이의 값을 가진다
0으로 수렴하지 않음
무조건 1로 수렴함</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/db83f9cc-6353-4cc4-b287-f099e682cafd/image.png" alt=""><pre><code>z = np.linspace(-10,10,100)
sigma = 1/(1+np.exp(-z))
</code></pre></li>
</ul>
<p>plt.figure(figsize=(12,8))
plt.plot(z, sigma)
plt.xlabel(&#39;$z$&#39;, fontsize=25)
plt.ylabel(&#39;$\sigma(z)$&#39;, fontsize=25)
plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/29fcd8f8-d231-43cf-828f-13109d4a6894/image.png)


# 3. 함수의 표현
---

### 1_벡터의 표현
- ![](https://velog.velcdn.com/images/jaam_mini/post/bac3f759-08f8-47e3-88b2-c9de06f8f06b/image.png)


### 2_스칼라 함수
- 단일변수 스칼라함수
  - ![](https://velog.velcdn.com/images/jaam_mini/post/ea7040f1-fc6e-4056-8e2c-19c8ed7a63a7/image.png)

- 다중변수 스칼라 함수
  - ![](https://velog.velcdn.com/images/jaam_mini/post/b8835364-bac2-48e6-b153-313a7e720c64/image.png)



### 3_다변수 벡터 함수
- ![](https://velog.velcdn.com/images/jaam_mini/post/ddb4377d-39ff-4b44-bca9-c19d439aa2ca/image.png)

- ![](https://velog.velcdn.com/images/jaam_mini/post/e248ac51-c68d-4537-b2ac-cd33d73d4562/image.png)
</code></pre><p>u = np.linspace(0,1,30)
v = np.linspace(0,1,30)</p>
<h1 id="npmeshgrid--uv의-많은-값을-한번에-계산점찍고하고-싶을-때-meshgrid를-사용">np.meshgrid : u,v의 많은 값을 한번에 계산(점찍고)하고 싶을 때 meshgrid를 사용</h1>
<p>U, V = np.meshgrid(u, v)</p>
<p>Z = (1+U<strong>2) + V/(1+V</strong>2)</p>
<p>fig = plt.figure(figsize=(7,7))</p>
<h1 id="projection--3d로-그리는-명령어">projection : 3D로 그리는 명령어</h1>
<p>ax = plt.axes(projection=&#39;3d&#39;)</p>
<p>ax.xaxis.set_tick_params(labelsize=10)
ax.yaxis.set_tick_params(labelsize=10)
ax.zaxis.set_tick_params(labelsize=10)</p>
<p>ax.set_xlabel(&#39;$x$&#39;,fontsize=10)
ax.set_ylabel(&#39;$y$&#39;,fontsize=10)
ax.set_zlabel(&#39;$z$&#39;,fontsize=10)</p>
<p>ax.scatter3D(U, V, Z, marker=&#39;.&#39;, color=&#39;gray&#39;)
plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/c590f070-25fd-45e4-b8ef-4aa2d1f00239/image.png)

### 4_함수의 합성
log 함수는 x가 크면 조금 변하고, x가 작으면 많이 변한다
- ![](https://velog.velcdn.com/images/jaam_mini/post/4268eae3-1cf6-48f5-a133-a8ba64e8516d/image.png)

- 각 함수의 모양 확인</code></pre><p>x = np.linspace(-4, 4, 100)</p>
<h1 id="fx">f(x)</h1>
<p>y = x*<em>3 - 15</em>x + 30</p>
<h1 id="gy">g(y)</h1>
<p>z = np.log(y)</p>
<p>fig, ax = plt.subplots(1, 2, figsize=(12, 6))</p>
<p>ax[0].plot(x, y, label=r&#39;$x^3 - 15x + 30$&#39;, color=&#39;k&#39;)
ax[0].legend(fontsize=18)</p>
<p>ax[1].plot(y, z, label=r&#39;$\log(y)$&#39;, color=&#39;k&#39;)
ax[1].legend(fontsize=18)</p>
<p>plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/db64ebab-102e-4d6e-a0e7-1901c93607b9/image.png)


- 합성 함수 모양 확인</code></pre><p>x = np.linspace(-4, 4, 100)</p>
<h1 id="fx-1">f(x)</h1>
<p>y = x*<em>3 - 15</em>x + 30</p>
<h1 id="gy-1">g(y)</h1>
<p>z = np.log(y)</p>
<p>fig, ax = plt.subplots(1, 2, figsize=(12, 6))</p>
<p>ax[0].plot(x, z, &#39;--&#39;, label=r&#39;$\log(f(x))$&#39;, color=&#39;k&#39;)
ax[0].legend(fontsize=18)</p>
<p>ax[1].plot(x, y, label=r&#39;$x^3 - 15x + 30$&#39;, color=&#39;k&#39;)
ax[1].legend(fontsize=18)</p>
<h1 id="2번째-그림에서-x축을-하나-더-만들라는-명령">2번째 그림에서 x축을 하나 더 만들라는 명령</h1>
<p>ax_tmp = ax[1].twinx()
ax_tmp.plot(x,z, &#39;--&#39;, label=r&#39;$\log(f(x))$&#39;, color=&#39;k&#39;)</p>
<p>plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/21ae407d-d13b-49ab-a6e7-4661961711e2/image.png)


# 4. boxplot
----
boxplot을 이용해서 
몇% 지점의 데이터를 찾아서 버리거나 검토하는 용도를 배우고자 함

- ![](https://velog.velcdn.com/images/jaam_mini/post/18a8a172-7551-4394-841f-5e1027fa69ac/image.png)

- 예제</code></pre><h1 id="모듈">모듈</h1>
<p>import matplotlib.pyplot as plt</p>
<p>samples = [1,7,9,16,36,39,45,45,46,48,51,100, 101]</p>
<h1 id="1이-lensample-만큼-있기를-기대함">[1]이 len(sample) 만큼 있기를 기대함</h1>
<p>tmp_y = [1]*len(samples)</p>
<p>plt.figure(figsize=(12,4))
plt.scatter(samples, tmp_y)
plt.grid()
plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/e9caf556-7b83-47d5-a518-ea986078fc97/image.png)

- 각 지표 찾는 방법
  - ![](https://velog.velcdn.com/images/jaam_mini/post/aaa1d1d0-ff98-4f0b-8ca8-fc9e611ce263/image.png)

- 그래프로 아웃라이어 확인</code></pre><h1 id="중간값">중간값</h1>
<p>import numpy as np
np.median(samples)</p>
<h1 id="-25프로-지점-찾기"># 25프로 지점 찾기</h1>
<h1 id="nppercentilesamples-25">np.percentile(samples, 25)</h1>
<h1 id="-75프로-지점-찾기"># 75프로 지점 찾기</h1>
<h1 id="nppercentilesamples-75">np.percentile(samples, 75)</h1>
<h1 id="-중앙값-찾기"># 중앙값 찾기</h1>
<h1 id="nppercentilesamples-75---nppercentilesamples-25">np.percentile(samples, 75) - np.percentile(samples, 25)</h1>
<p>q1 = np.percentile(samples, 25)
q2 = np.median(samples)
q3 = np.percentile(samples, 75)
iqr = q3 - q1</p>
<h1 id="iqr">IQR</h1>
<p>upper_fence = q3 + iqr<em>1.5
lower_fence = q1 - iqr</em>1.5</p>
<p>plt.figure(figsize=(12,4))
plt.scatter(samples, tmp_y)
plt.axvline(x=q1, color=&#39;black&#39;)
plt.axvline(x=q2, color=&#39;red&#39;)
plt.axvline(x=q3, color=&#39;black&#39;)
plt.axvline(x=upper_fence, color=&#39;black&#39;, ls=&#39;dashed&#39;)
plt.axvline(x=lower_fence, color=&#39;black&#39;, ls=&#39;dashed&#39;)
plt.grid()
plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/f00aa69d-2c53-408d-ae23-ae0b98cc4fbb/image.png)

- seaborn boxplot</code></pre><p>import seaborn as sns
plt.figure(figsize=(3,6))
sns.boxplot(samples)
plt.grid()
plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/c6b99976-c4ac-4ed5-91c5-c10e1fd14080/image.png)


















</code></pre>]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 4. Decision tree, Pipeline, 하이퍼파라미터 튜닝 - 와인 분석]]></title>
            <link>https://velog.io/@jaam_mini/ML-4.-Decision-tree-Pipeline-%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D-%EC%99%80%EC%9D%B8-%EB%B6%84%EC%84%9D</link>
            <guid>https://velog.io/@jaam_mini/ML-4.-Decision-tree-Pipeline-%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D-%EC%99%80%EC%9D%B8-%EB%B6%84%EC%84%9D</guid>
            <pubDate>Mon, 22 Jan 2024 03:16:43 GMT</pubDate>
            <description><![CDATA[<h1 id="1-와인데이터-분석">1. 와인데이터 분석</h1>
<hr>
<h3 id="1_데이터-읽어오기">1_데이터 읽어오기</h3>
<pre><code>import pandas as pd

red_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;
white_url = &#39;https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;</code></pre><pre><code>red_wine = pd.read_csv(red_url, sep=&#39;;&#39;)
white_wine = pd.read_csv(white_url, sep=&#39;;&#39;)

# (주의) ; 로 해야 아래 처럼 뜸. :로 하면 이상하게 뜬다...</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/e4df9520-aa3a-4a02-b139-2efdea257eb2/image.png" alt=""></li>
</ul>
<h3 id="2_컬럼조사">2_컬럼조사</h3>
<pre><code>white_wine.columns</code></pre><p>Index([&#39;fixed acidity&#39;, &#39;volatile acidity&#39;, &#39;citric acid&#39;, &#39;residual sugar&#39;,
       &#39;chlorides&#39;, &#39;free sulfur dioxide&#39;, &#39;total sulfur dioxide&#39;, &#39;density&#39;,
       &#39;pH&#39;, &#39;sulphates&#39;, &#39;alcohol&#39;, &#39;quality&#39;],
      dtype=&#39;object&#39;)</p>
<h3 id="3_데이터합치기">3_데이터합치기</h3>
<p>데이터를 합치기 전에 레드/화이트 구분을 지어줘야 함</p>
<pre><code>red_wine[&#39;color&#39;] = 1.
white_wine[&#39;color&#39;] = 0.

wine = pd.concat([red_wine, white_wine])
wine.info()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/1bd6da34-823a-4f72-ac49-1f8350e06961/image.png" alt=""></li>
</ul>
<h3 id="4_quality-컬럼-histogram">4_[&#39;quality&#39;] 컬럼 histogram</h3>
<pre><code>wine[&#39;quality&#39;].unique()</code></pre><p>array([5, 6, 7, 4, 8, 3, 9], dtype=int64)</p>
<pre><code>import plotly.express as px

# 데이터는 wine, x축은 quality
fig = px.histogram(wine, x=&#39;quality&#39;)
fig.show()</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/b392902a-a956-41f1-84ee-b58ae36ad055/image.png" alt=""></p>
<h3 id="5_등급별-histogram-레드화이트">5_등급별 histogram (레드/화이트)</h3>
<pre><code># y자리에 color 로 함으로써 데이터별 색상을 넣어 줌
fig = px.histogram(wine, x=&#39;quality&#39;, color=&#39;color&#39;)
fig.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/80107cf9-0862-45b6-a311-8bc439099d8d/image.png" alt=""></li>
</ul>
<h3 id="6_분류기-레드화이트">6_분류기 (레드/화이트)</h3>
<pre><code># 1) feature data = 레드/화이트 맞추기
X = wine.drop([&#39;color&#39;], axis=1)

# 2) label data = 맞추고 싶은 대상
y = wine[&#39;color&#39;]</code></pre><pre><code># 3) 훈련/테스트용 설정 (train/test split)

# 모듈
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# y_train에 어떤게 있는지 확인, return_counts(갯수) 확인
np.unique(y_train, return_counts=True)</code></pre><p>(array([0., 1.]), array([3913, 1284], dtype=int64))</p>
<pre><code># 4) 어느 정도 구분되었는지 Histogram으로 확인

# graph_objects 모듈
import plotly.graph_objects as go

# Figure 호출
fig = go.Figure()

# go에서 Histogram을 가지고 옴
fig.add_trace(go.Histogram(x=X_train[&#39;quality&#39;], name=&#39;Train&#39;))
fig.add_trace(go.Histogram(x=X_test[&#39;quality&#39;], name=&#39;Test&#39;))

# 설정
# update_layout은 겹쳐지게(overlay)
# 투명도(update_traces)는 0.75
fig.update_layout(barmode=&#39;overlay&#39;)
fig.update_traces(opacity=0.75)
fig.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/19356256-1854-47e1-9073-6a8bbbb5338e/image.png" alt=""></li>
</ul>
<h3 id="7_decision-tree">7_Decision tree</h3>
<pre><code># 1) fit(학습)

# 모듈
from sklearn.tree import DecisionTreeClassifier

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/6235b0be-1423-477e-9210-1317e658eda4/image.png" alt=""></li>
</ul>
<pre><code># 2) train accuracy(학습 결과 확인)

# 모듈
from sklearn.metrics import accuracy_score

# predict(훈련된 값)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

# 결과 확인 (참값, 예측값)
print(&#39;Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</code></pre><p>Train Acc : 0.9553588608812776
Test Acc : 0.9569230769230769</p>
<h3 id="8_데이터-전처리">8_데이터 전처리</h3>
<pre><code>X.columns

# feature data = 레드/화이트 맞추기
# X = wine.drop([&#39;color&#39;], axis=1)</code></pre><p>Index([&#39;fixed acidity&#39;, &#39;volatile acidity&#39;, &#39;citric acid&#39;, &#39;residual sugar&#39;,
       &#39;chlorides&#39;, &#39;free sulfur dioxide&#39;, &#39;total sulfur dioxide&#39;, &#39;density&#39;,
       &#39;pH&#39;, &#39;sulphates&#39;, &#39;alcohol&#39;, &#39;quality&#39;],
      dtype=&#39;object&#39;)</p>
<ul>
<li>1) Boxplot</li>
</ul>
<pre><code># graph_objects 모듈
import plotly.graph_objects as go

# Figure 호출
fig = go.Figure()

# go에서 Boxplot 가지고 옴
fig.add_trace(go.Box(y=X[&#39;fixed acidity&#39;], name=&#39;fixed acidity&#39;))
fig.add_trace(go.Box(y=X[&#39;chlorides&#39;], name=&#39;chlorides&#39;))
fig.add_trace(go.Box(y=X[&#39;quality&#39;], name=&#39;quality&#39;))

fig.show()</code></pre><ul>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/cb1f8f52-83b4-4d39-8175-8796dfe700ff/image.png" alt=""></p>
</li>
<li><p>2) MinMaxScaler &amp; StandardScaler 중 어떤게 좋을지 확인</p>
<pre><code># 모듈
from sklearn.preprocessing import MinMaxScaler, StandardScaler
</code></pre></li>
</ul>
<h1 id="인스턴시에이션-instantiation--이름을-가진-독립된-객체를-다룰-수-있게-함">인스턴시에이션 (instantiation) : 이름을 가진 독립된 객체를 다룰 수 있게 함</h1>
<h1 id="둘-다-해봐야-어떤-것이-좋은지-알-수-있음">둘 다 해봐야 어떤 것이 좋은지 알 수 있음</h1>
<p>MMS = MinMaxScaler()
SS = StandardScaler()</p>
<h1 id="fit">fit()</h1>
<p>MMS.fit(X)
SS.fit(X)</p>
<h1 id="transform">transform()</h1>
<p>X_mms = MMS.transform(X)
X_ss = SS.transform(X)</p>
<h1 id="그래프를-그리고-싶어서-dataframe을-만듬">그래프를 그리고 싶어서 DataFrame을 만듬</h1>
<p>X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)
X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)</p>
<pre><code>- 3) MinMaxScaler : 최대/최소값을 1,0으로 강제로 맞춤</code></pre><h1 id="graph_objects-모듈">graph_objects 모듈</h1>
<p>import plotly.graph_objects as go</p>
<h1 id="figure-호출">Figure 호출</h1>
<p>fig = go.Figure()</p>
<h1 id="go에서-boxplot-가지고-옴">go에서 Boxplot 가지고 옴</h1>
<p>fig.add_trace(go.Box(y=X_mms_pd[&#39;fixed acidity&#39;], name=&#39;fixed acidity&#39;))
fig.add_trace(go.Box(y=X_mms_pd[&#39;chlorides&#39;], name=&#39;chlorides&#39;))
fig.add_trace(go.Box(y=X_mms_pd[&#39;quality&#39;], name=&#39;quality&#39;))</p>
<p>fig.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/556a1a7a-ec37-4c08-a491-bc4e6c349483/image.png)

- 4) StandardScaler : 평균을 0, 표준편차를 1로 맞춤</code></pre><h1 id="graph_objects-모듈-1">graph_objects 모듈</h1>
<p>import plotly.graph_objects as go</p>
<h1 id="figure-호출-1">Figure 호출</h1>
<p>fig = go.Figure()</p>
<h1 id="go에서-boxplot-가지고-옴-1">go에서 Boxplot 가지고 옴</h1>
<p>fig.add_trace(go.Box(y=X_ss_pd[&#39;fixed acidity&#39;], name=&#39;fixed acidity&#39;))
fig.add_trace(go.Box(y=X_ss_pd[&#39;chlorides&#39;], name=&#39;chlorides&#39;))
fig.add_trace(go.Box(y=X_ss_pd[&#39;quality&#39;], name=&#39;quality&#39;))</p>
<p>fig.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/93023ba9-160e-4998-96e6-48d1c8dd5583/image.png)

- 5) [함수]로 만들어보기 : MinMaxScaler, StandardScaler</code></pre><h1 id="graph_objects-모듈-2">graph_objects 모듈</h1>
<p>import plotly.graph_objects as go</p>
<h1 id="target_df-만들고">target_df 만들고</h1>
<p>def px_box(target_df):</p>
<pre><code># Figure 호출
fig = go.Figure()

# y값에 target_df 반영
fig.add_trace(go.Box(y=target_df[&#39;fixed acidity&#39;], name=&#39;fixed acidity&#39;))
fig.add_trace(go.Box(y=target_df[&#39;chlorides&#39;], name=&#39;chlorides&#39;))
fig.add_trace(go.Box(y=target_df[&#39;quality&#39;], name=&#39;quality&#39;))

fig.show()</code></pre><pre><code></code></pre><p>px_box(X_mms_pd)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/212394e5-e699-4d75-ba39-ab55774f0411/image.png)
</code></pre><p>px_box(X_ss_pd)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/87d12d79-25a8-44f0-b5d0-5734f1dd178c/image.png)

- 6) MinMaxScaler 적용/학습
</code></pre><p>#split
X_train, X_test, y_train, y_test = train_test_split(X_mms_pd, y, test_size=0.2, random_state=13)</p>
<p>wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)</p>
<h1 id="fit-1">fit</h1>
<p>wine_tree.fit(X_train, y_train)</p>
<h1 id="predict">predict</h1>
<p>y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)</p>
<p>print(&#39;Mms Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Mms Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</p>
<pre><code>Mms Train Acc : 0.9553588608812776
Mms Test Acc : 0.9569230769230769

- 7) StandardScaler 적용/학습</code></pre><p>#split
X_train, X_test, y_train, y_test = train_test_split(X_ss_pd, y, test_size=0.2, random_state=13)</p>
<p>wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)</p>
<h1 id="fit-2">fit</h1>
<p>wine_tree.fit(X_train, y_train)</p>
<h1 id="predict-1">predict</h1>
<p>y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)</p>
<p>print(&#39;Ss Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Ss Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</p>
<pre><code>Ss Train Acc : 0.9553588608812776
Ss Test Acc : 0.9569230769230769
</code></pre><h1 id="8-zip---레드화이트-와인-구분-특성">8) zip - 레드/화이트 와인 구분 특성</h1>
<h1 id="x_train의-컬럼-이름을-그대로-사용">X_train의 컬럼 이름을 그대로 사용</h1>
<h1 id="분류기--fit-했던-wine_tree">분류기 : fit 했던 wine_tree</h1>
<h1 id="feature_importances_">feature_importances_</h1>
<pre><code># 트리 기반 모델 중, 밀접한 관련이 있는 피처를 중요도 순으로 나열함
# 가중치가 적은 변수를 제거, 모델의 성능을 최적화 &amp; 정확도를 높임</code></pre><h1 id="zip--dict-로-바꿈">zip-&gt; dict 로 바꿈</h1>
<p>dict(zip(X_train.columns, wine_tree.feature_importances_))</p>
<h1 id="▼-결과">▼ 결과</h1>
<h1 id="max_depth2-로-잡았기-때문에-2개-결과만-값이-0이-아님을-확인할-수-있다">max_depth=2 로 잡았기 때문에, 2개 결과만 값이 0이 아님을 확인할 수 있다</h1>
<pre><code>{&#39;fixed acidity&#39;: 0.0,
 &#39;volatile acidity&#39;: 0.0,
 &#39;citric acid&#39;: 0.0,
 &#39;residual sugar&#39;: 0.0,
 &#39;chlorides&#39;: 0.24230360549660776,
 &#39;free sulfur dioxide&#39;: 0.0,
 &#39;total sulfur dioxide&#39;: 0.7576963945033922,
 &#39;density&#39;: 0.0,
 &#39;pH&#39;: 0.0,
 &#39;sulphates&#39;: 0.0,
 &#39;alcohol&#39;: 0.0,
 &#39;quality&#39;: 0.0}


### 9_이진분류</code></pre><h1 id="1-quality-컬럼-이진화">1) quality 컬럼 이진화</h1>
<h1 id="wine-데이터의-taste-컬럼-생성">wine 데이터의 [&#39;taste&#39;] 컬럼 생성</h1>
<h1 id="quality-column울-grade로-잡고-5등급-보다-크면-1-그게-아니라면-0으로-잡음">quality column을 grade로 잡고, 5등급 보다 크면 1, 그게 아니라면 0으로 잡음</h1>
<p>wine[&#39;taste&#39;] = [1. if grade&gt;5 else 0. for grade in wine[&#39;quality&#39;]]</p>
<p>wine.head()</p>
<pre><code>-- ![](https://velog.velcdn.com/images/jaam_mini/post/a34628c1-9e8f-42c4-b0e1-cae5be080a40/image.png)

</code></pre><h1 id="2-모델링fit">2) 모델링(fit)</h1>
<h1 id="label인-taste를-drop-나머지를-x의-특성으로-봄">label인 taste를 drop, 나머지를 X의 특성으로 봄</h1>
<p>X = wine.drop([&#39;taste&#39;], axis=1)</p>
<h1 id="새로만들-y데이터">새로만들 y데이터</h1>
<p>y = wine[&#39;taste&#39;]</p>
<p>#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)</p>
<h1 id="fit-3">fit</h1>
<p>wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)</p>
<pre><code></code></pre><h1 id="3-평가accuracy">3) 평가(accuracy)</h1>
<h1 id="모듈">모듈</h1>
<p>from sklearn.metrics import accuracy_score</p>
<h1 id="predict훈련된-값">predict(훈련된 값)</h1>
<p>y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)</p>
<h1 id="결과-확인-참값-예측값">결과 확인 (참값, 예측값)</h1>
<p>print(&#39;Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</p>
<pre><code>Train Acc : 1.0
Test Acc : 1.0

- 결정트리 모델의 정확도가 1.0이 나온다면, 이는 모델이 학습 데이터에 완벽하게 적합되어 과적합(overfitting)된 상태일 수 있는데 결정트리를 시각화하여 무엇이 잘못되었는지 확인해봐야 함</code></pre><h1 id="4-결정트리">4) 결정트리</h1>
<p>import matplotlib.pyplot as plt
import sklearn.tree as tree</p>
<p>plt.figure(figsize=(6,5))
tree.plot_tree(wine_tree, feature_names=X.columns.tolist());</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/841d1cd7-87c1-414b-86cf-a93e73605efc/image.png)


- taste를 만들었던 quality 컬럼 값이 아직 살아 있음
- quality 컬럼을 가지고 학습하여 1.0이 나온 것임
- 따라서 quality 컬럼을 drop해서 다시 모델을 제작
</code></pre><h1 id="5-droptastequality-후-모델링--평가">5) drop([&#39;taste&#39;,&#39;quality&#39;] 후 모델링 &amp; 평가</h1>
<p>x = wine.drop([&#39;taste&#39;,&#39;quality&#39;], axis=1)
y = wine[&#39;taste&#39;]</p>
<p>x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=13)</p>
<p>wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(x_train, y_train)</p>
<p>y_pred_tr = wine_tree.predict(x_train)
y_pred_test = wine_tree.predict(x_test)</p>
<p>accuracy_score(y_train, y_pred_tr)
accuracy_score(y_test, y_pred_test)</p>
<p>print(&#39;Train Acc: &#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc: &#39;, accuracy_score(y_test, y_pred_test))</p>
<pre><code>[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]
0.709578255462782
[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]
fixed acidity    volatile acidity    citric acid    residual sugar    chlorides    free sulfur dioxide    total sulfur dioxide    density    pH    sulphates    alcohol    quality
0    7.4    0.7    0.0    1.9    0.076    11.0    34.0    0.9978    3.51    0.56    9.4    5
fixed acidity    volatile acidity    citric acid    residual sugar    chlorides    free sulfur dioxide    total sulfur dioxide    density    pH    sulphates    alcohol    quality
0    7.0    0.27    0.36    20.7    0.045    45.0    170.0    1.001    3.0    0.45    8.8    6
Index([&#39;fixed acidity&#39;, &#39;volatile acidity&#39;, &#39;citric acid&#39;, &#39;residual sugar&#39;,
       &#39;chlorides&#39;, &#39;free sulfur dioxide&#39;, &#39;total sulfur dioxide&#39;, &#39;density&#39;,
       &#39;pH&#39;, &#39;sulphates&#39;, &#39;alcohol&#39;, &#39;quality&#39;],
      dtype=&#39;object&#39;)
&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Index: 6497 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  color                 6497 non-null   float64
dtypes: float64(12), int64(1)
memory usage: 710.6 KB
array([5, 6, 7, 4, 8, 3, 9], dtype=int64)
(array([0., 1.]), array([3913, 1284], dtype=int64))

DecisionTreeClassifier
DecisionTreeClassifier(max_depth=2, random_state=13)
Train Acc : 0.9553588608812776
Test Acc : 0.9569230769230769
Index([&#39;fixed acidity&#39;, &#39;volatile acidity&#39;, &#39;citric acid&#39;, &#39;residual sugar&#39;,
       &#39;chlorides&#39;, &#39;free sulfur dioxide&#39;, &#39;total sulfur dioxide&#39;, &#39;density&#39;,
       &#39;pH&#39;, &#39;sulphates&#39;, &#39;alcohol&#39;, &#39;quality&#39;],
      dtype=&#39;object&#39;)
Mms Train Acc : 0.9553588608812776
Mms Test Acc : 0.9569230769230769
Ss Train Acc : 0.9553588608812776
Ss Test Acc : 0.9569230769230769
{&#39;fixed acidity&#39;: 0.0,
 &#39;volatile acidity&#39;: 0.0,
 &#39;citric acid&#39;: 0.0,
 &#39;residual sugar&#39;: 0.0,
 &#39;chlorides&#39;: 0.24230360549660776,
 &#39;free sulfur dioxide&#39;: 0.0,
 &#39;total sulfur dioxide&#39;: 0.7576963945033922,
 &#39;density&#39;: 0.0,
 &#39;pH&#39;: 0.0,
 &#39;sulphates&#39;: 0.0,
 &#39;alcohol&#39;: 0.0,
 &#39;quality&#39;: 0.0}
fixed acidity    volatile acidity    citric acid    residual sugar    chlorides    free sulfur dioxide    total sulfur dioxide    density    pH    sulphates    alcohol    quality    color    taste
0    7.4    0.70    0.00    1.9    0.076    11.0    34.0    0.9978    3.51    0.56    9.4    5    1.0    0.0
1    7.8    0.88    0.00    2.6    0.098    25.0    67.0    0.9968    3.20    0.68    9.8    5    1.0    0.0
2    7.8    0.76    0.04    2.3    0.092    15.0    54.0    0.9970    3.26    0.65    9.8    5    1.0    0.0
3    11.2    0.28    0.56    1.9    0.075    17.0    60.0    0.9980    3.16    0.58    9.8    6    1.0    1.0
4    7.4    0.70    0.00    1.9    0.076    11.0    34.0    0.9978    3.51    0.56    9.4    5    1.0    0.0

DecisionTreeClassifier
DecisionTreeClassifier(max_depth=2, random_state=13)
Train Acc : 1.0
Test Acc : 1.0

Train Acc:  0.7294593034442948
Test Acc:  0.7161538461538461

- 6)결정트리 값을 통해 와인의 맛 평가의 기준이 [alcohol] 인 것을 확인함
</code></pre><h1 id="6-결정트리">6) 결정트리</h1>
<p>import matplotlib.pyplot as plt
import sklearn.tree as tree</p>
<p>plt.figure(figsize=(12,5))
tree.plot_tree(wine_tree, rounded=True, filled=True, feature_names=X.columns.tolist());
plt.show()</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/19c1428b-7f80-4762-9f55-e3fcb2d56f2b/image.png)

# 2. Pipeline
----
- 단순히 Iris, Wine 데이터를 받아서 사용했을 뿐인데, 직접 공부하면서 코드를 하나씩 실행해보면 혼돈이 크다는 것을 알 수 있다.
- Jupyter Notebook 상황에서 데이터의 전처리와 여러 알고리즘의 반복 실행, 하이퍼 파라미터의 튜닝 과정을 번갈아 하다 보면 코드의 실행 순서에 혼돈이 있을 수 있다.
- 이런 경우 클래스(class)로 만들어서 진행해도 되지만, sklearn 유저에게는 꼭 그럴 필요없이 준비된 기능인 Pipeline이 있다.

</code></pre><h1 id="1-데이터-불러오기--concat">1) 데이터 불러오기 &amp; concat</h1>
<p>import pandas as pd</p>
<p>red_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;</a>
white_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;</a>
red_wine = pd.read_csv(red_url, sep=&#39;;&#39;)
white_wine = pd.read_csv(white_url, sep=&#39;;&#39;)</p>
<p>red_wine[&#39;color&#39;] = 1.
white_wine[&#39;color&#39;] = 0.</p>
<p>wine = pd.concat([red_wine, white_wine])</p>
<p>x = wine.drop([&#39;color&#39;], axis=1)
y = wine[&#39;color&#39;]</p>
<pre><code>- 파이프라인을 한번 짜 놓으면, 호출 시 알아서 진행
</code></pre><h1 id="2-파이프라인-생성">2) 파이프라인 생성</h1>
<h1 id="3가지-모듈">3가지 모듈</h1>
<p>from sklearn.pipeline import Pipeline
from sklearn.tree import  DecisionTreeClassifier
from sklearn.preprocessing import  StandardScaler</p>
<h1 id="변수에-리스트-형-튜플로-지정">변수에 리스트 형, 튜플로 지정</h1>
<p>estimators = [
    (&#39;scaler&#39;, StandardScaler()),
    (&#39;clf&#39;, DecisionTreeClassifier())
]</p>
<h1 id="변수에-파이프라인-설정">변수에 파이프라인 설정</h1>
<p>pipe = Pipeline(estimators)</p>
<pre><code>
- 어떤 스텝으로 움직이는지 확인
    - 첫번쨰 단계는 scaler라고 부르고, StandardScaler() 가 지정되어 있음
    - 두번쨰 단계는 clf라고 부르고, DecisionTreeClassifier() 가 지정되어 있음
</code></pre><p>pipe</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/5cb49bdf-03f3-4ec6-b7aa-e4cf42c32142/image.png)
</code></pre><p>pipe.steps</p>
<pre><code>[(&#39;scaler&#39;, StandardScaler()), (&#39;clf&#39;, DecisionTreeClassifier())]


- 객체 호출 방법
</code></pre><p>pipe.steps[0]</p>
<pre><code>(&#39;scaler&#39;, StandardScaler())

</code></pre><p>pipe[&#39;scaler&#39;]</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/a4b4b01f-8954-4a5d-a1d4-e7420ee1df8c/image.png)


- DecisionTreeClassifier() 에는 지정해야 하는 [파라미터]가 있음
- set_params (스탭이름 “clf” + 언더바 두 개 “- -” + 속성 이름)
</code></pre><h1 id="3-params-접근">3) Params 접근</h1>
<h1 id="decisiontreeclassifier-메서드를-clf로-위에서-정의했고-언더바를-붙여서-max_dept-파라미터를-2로-설정-한-것">DecisionTreeClassifier() 메서드를 &#39;clf&#39;로 위에서 정의했고, 언더바를 붙여서 max_dept 파라미터를 2로 설정 한 것</h1>
<h1 id="즉-언더바-2개를-추가해서-접근했다고-보면-됨-httpsgurutistorycom50">즉, 언더바 2개를 추가해서 접근했다고 보면 됨 (<a href="https://guru.tistory.com/50">https://guru.tistory.com/50</a>)</h1>
<h1 id="clf에-max_depth2를-설정--clf--_-_--max_depth2">clf에 max_depth=2를 설정 : clf + _ _ + max_depth=2</h1>
<p>pipe.set_params(clf__max_depth=2)
pipe.set_params(clf__random_state=13)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/9bf4b8c4-b6a3-49e2-84b8-00de1c66db1e/image.png)
</code></pre><h1 id="4-split--fit">4) split + fit</h1>
<p>from sklearn.model_selection import train_test_split</p>
<h1 id="stratifyy--y데이터의-분로픞-맞춰라">stratify=y : y데이터의 분포를 맞춰라</h1>
<p>X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)</p>
<h1 id="1-parameter">(1) Parameter</h1>
<h1 id="arrays--분할시킬-데이터를-입력-python-list-numpy-array-pandas-dataframe-등">arrays : 분할시킬 데이터를 입력 (Python list, Numpy array, Pandas dataframe 등..)</h1>
<h1 id="test_size--테스트-데이터셋의-비율float이나-갯수int-default--025">test_size : 테스트 데이터셋의 비율(float)이나 갯수(int) (default = 0.25)</h1>
<h1 id="train_size--학습-데이터셋의-비율float이나-갯수int-default--test_size의-나머지">train_size : 학습 데이터셋의 비율(float)이나 갯수(int) (default = test_size의 나머지)</h1>
<h1 id="random_state--데이터-분할시-셔플이-이루어지는데-이를-위한-시드값-int나-randomstate로-입력">random_state : 데이터 분할시 셔플이 이루어지는데 이를 위한 시드값 (int나 RandomState로 입력)</h1>
<h1 id="shuffle--셔플여부설정-default--true">shuffle : 셔플여부설정 (default = True)</h1>
<h1 id="stratify--지정한-data의-비율을-유지한다">stratify : 지정한 Data의 비율을 유지한다.</h1>
<h1 id="예를-들어-label-set인-y가-25의-0과-75의-1로-이루어진-binary-set일-때-stratifyy로-설정하면-나누어진-데이터셋들도-0과-1을-각각-25-75로-유지한-채-분할된다">예를 들어, Label Set인 Y가 25%의 0과 75%의 1로 이루어진 Binary Set일 때, stratify=Y로 설정하면 나누어진 데이터셋들도 0과 1을 각각 25%, 75%로 유지한 채 분할된다.</h1>
<pre><code>
- (예전) Scaler 통과 + 분류기 학습
- (지금) 이미 선언해둔 pipe 이용

</code></pre><h1 id="5-pipe">5) pipe</h1>
<p>pipe.fit(X_train, y_train)</p>
<pre><code>
- ![](https://velog.velcdn.com/images/jaam_mini/post/1f9d441f-f394-40bc-8418-1569a378cf88/image.png)

</code></pre><h1 id="6-결과-확인">6) 결과 확인</h1>
<p>from sklearn.metrics import accuracy_score</p>
<p>y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)</p>
<p>print(&#39;Train Acc :&#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc :&#39;, accuracy_score(y_test, y_pred_test))</p>
<pre><code>Train Acc : 1.0
Test Acc : 1.0


# 3. 교차검증
---
</code></pre><h1 id="1-데이터-불러오기--concat-1">1) 데이터 불러오기 &amp; concat</h1>
<p>import pandas as pd</p>
<p>red_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;</a>
white_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;</a>
red_wine = pd.read_csv(red_url, sep=&#39;;&#39;)
white_wine = pd.read_csv(white_url, sep=&#39;;&#39;)</p>
<p>red_wine[&#39;color&#39;] = 1.
white_wine[&#39;color&#39;] = 0.</p>
<p>wine = pd.concat([red_wine, white_wine])</p>
<pre><code></code></pre><h1 id="2-맛-분류를-위한-데이터-정리">2) 맛 분류를 위한 데이터 정리</h1>
<p>wine[&#39;taste&#39;] = [1. if grade &gt; 5 else 0 for grade in wine[&#39;quality&#39;]]</p>
<p>X = wine.drop([&#39;taste&#39;,&#39;quality&#39;], axis= 1)
y = wine[&#39;taste&#39;]</p>
<h1 id="3-의사-결정-나무-모델-확인">3) 의사 결정 나무 모델 확인</h1>
<p>from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score</p>
<p>x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=13)</p>
<p>wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(x_train, y_train)</p>
<p>y_pred_tr = wine_tree.predict(x_train)
y_pred_test = wine_tree.predict(x_test)</p>
<p>print(&#39;Train Acc: &#39;, accuracy_score(y_train, y_pred_tr))
print(&#39;Test Acc: &#39;, accuracy_score(y_test, y_pred_test))</p>
<pre><code>Train Acc:  0.7294593034442948
Test Acc:  0.7161538461538461


### 1_KFold()
</code></pre><p>Train Acc:  0.7294593034442948
Test Acc:  0.7161538461538461</p>
<pre><code>- 위 값이 최선인지, acc를 신뢰할 수 있는지 확인하기 위해 KFold(교차검증)이 필요
</code></pre><h1 id="4-kfold">4) KFold</h1>
<h1 id="모듈-1">모듈</h1>
<p>from sklearn.model_selection import KFold</p>
<h1 id="n_splits는-몇-개의-폴드fold로-나눌-것인지를-의미하는-매개변수">n_splits는 몇 개의 폴드(fold)로 나눌 것인지를 의미하는 매개변수</h1>
<h1 id="5겹-교차-검증이-가장-일반적">5겹 교차 검증이 가장 일반적</h1>
<p>kfold = KFold(n_splits=5)</p>
<p>wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)</p>
<pre><code></code></pre><h1 id="5-모델링-학습--결과-확인">5) 모델링 (학습 &amp; 결과 확인)</h1>
<h1 id="기록-보관을-위해-빈리스트생성">기록 보관을 위해 &#39;빈리스트&#39;생성</h1>
<p>cv_accuracy =[]</p>
<p>for train_idx, test_idx in kfold.split(X):</p>
<pre><code># 데이터 구성
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# 학습
wine_tree_cv.fit(X_train, y_train)

# 모델링
pred = wine_tree_cv.predict(X_test)

# 리스트 저장 (결과값)
cv_accuracy.append(accuracy_score(y_test, pred))</code></pre><p>cv_accuracy</p>
<pre><code>[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]
</code></pre><h1 id="6-kfold-평균값-확인">6) KFold 평균값 확인</h1>
<p>import numpy as np</p>
<p>np.mean(cv_accuracy)</p>
<pre><code>0.709578255462782

### 2_StratifiedKFold()
</code></pre><h1 id="7-stratifiedkfold">7) StratifiedKFold</h1>
<p>from sklearn.model_selection import StratifiedKFold</p>
<p>skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)</p>
<p>cv_accuracy = []</p>
<p>for train_idx, test_idx in skfold.split(X, y):</p>
<pre><code># 데이터 구성
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# 학습
wine_tree_cv.fit(X_train, y_train)

# 모델링
pred = wine_tree_cv.predict(X_test)

# 리스트 저장 (결과값)
cv_accuracy.append(accuracy_score(y_test, pred))</code></pre><p>cv_accuracy</p>
<pre><code>[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]
</code></pre><h1 id="8-stratifiedkfold-평균값-확인">8) StratifiedKFold 평균값 확인</h1>
<p>import numpy as np</p>
<p>np.mean(cv_accuracy)</p>
<pre><code>0.6888004974240539

### 3_cross validation

</code></pre><p>from sklearn.model_selection import cross_val_score</p>
<p>skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)</p>
<p>cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold)</p>
<pre><code>array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])


### 4_함수로 풀어보기</code></pre><p>def skfold_dt(depth):
    from sklearn.model_selection import cross_val_score</p>
<pre><code>skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)

print(cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold))</code></pre><pre><code></code></pre><p>skfold_dt(3)</p>
<pre><code>[0.56846154 0.68846154 0.71439569 0.73210162 0.75673595]

### 5_train score와 함께 보고 싶다면</code></pre><p>from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, y, scoring=None, cv=skfold, return_train_score=True)</p>
<pre><code>{&#39;fit_time&#39;: array([0.01700068, 0.01591516, 0.01597691, 0.01599717, 0.01500058]),
 &#39;score_time&#39;: array([0.00108767, 0.01203322, 0.00293016, 0.00198984, 0.00199056]),
 &#39;test_score&#39;: array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772]),
 &#39;train_score&#39;: array([0.78795459, 0.78045026, 0.77568295, 0.76356291, 0.76279338])}

# 4. 하이퍼파라미터 튜닝
----

- 튜닝 대상
    - 결정나무에서 아직 우리가 튜닝해 볼만한 것은 max_depth이다.
    - 간단하게 반복문으로 max_depth를 바꿔가며 테스트해볼 수 있을 것이다.
    - 그런데 앞으로를 생각해서 보다 간편하고 유용한 방법을 생각해보자.
</code></pre><h1 id="1-데이터-불러오기--concat-2">1) 데이터 불러오기 &amp; concat</h1>
<p>import pandas as pd</p>
<p>red_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv&#39;</a>
white_url = &#39;<a href="https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;">https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv&#39;</a>
red_wine = pd.read_csv(red_url, sep=&#39;;&#39;)
white_wine = pd.read_csv(white_url, sep=&#39;;&#39;)</p>
<p>red_wine[&#39;color&#39;] = 1.
white_wine[&#39;color&#39;] = 0.</p>
<p>wine = pd.concat([red_wine, white_wine])</p>
<h1 id="2-맛-분류를-위한-데이터-정리-1">2) 맛 분류를 위한 데이터 정리</h1>
<p>wine[&#39;taste&#39;] = [1. if grade &gt; 5 else 0 for grade in wine[&#39;quality&#39;]]</p>
<p>X = wine.drop([&#39;taste&#39;,&#39;quality&#39;], axis= 1)
y = wine[&#39;taste&#39;]</p>
<pre><code>
### 1) GridSearchCV
- 매번 하이퍼파라미터를 수정할 순 없음
- 예를 들어 pipeline을 5개 만든 경우, 하이퍼파라미터를 수정해야 하는 경우의 수는 엄청남
- 그래서, 수정할 파라미터를 지정 -&gt; GridSearchCV(분류기)에 알아서 cv=5겹으로 fit해라는 명령인 &quot;GridSearchCV&quot;를 이용
- (참고)https://blog.naver.com/dalgoon02121/222103377185

</code></pre><h1 id="모듈-2">모듈</h1>
<p>from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier</p>
<h1 id="파라미터-지정">파라미터 지정</h1>
<p>params = {&#39;max_depth&#39;:[2,4,7,10]}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)</p>
<h1 id="변수에--gridsearchcv분류기로-wine_tree를-지정-param_grid는-params로-5겹으로">변수에 = GridSearchCV(분류기로 wine_tree를 지정, param_grid는 params로, 5겹으로)</h1>
<p>gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)</p>
<h1 id="학습-split을-쓰지-않아도-됨">학습 (split을 쓰지 않아도 됨)</h1>
<p>gridsearch.fit(X,y)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/b4f4da6c-e611-457f-a7ac-9c7f3db8b47d/image.png)
</code></pre><h1 id="결과-확인">결과 확인</h1>
<h1 id="pprint는-데이터를-보기-좋게-출력pretty-print할-때-사용하는-모듈">pprint는 데이터를 보기 좋게 출력(pretty print)할 때 사용하는 모듈</h1>
<p>import pprint</p>
<p>pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/1aa1f92d-234f-448b-8f84-4a0f0d225d2e/image.png)


### 2) 데이터 관찰</code></pre><h1 id="최고의-성능을-가진-모델">최고의 성능을 가진 모델</h1>
<p>gridsearch.best_estimator_</p>
<h1 id="결과--max_depth2-일때">결과 : max_depth=2 일때</h1>
<pre><code></code></pre><h1 id="최고-점수">최고 점수</h1>
<p>gridsearch.best_score_</p>
<h1 id="결과--69">결과 : 69%</h1>
<pre><code></code></pre><h1 id="최고-파라미터">최고 파라미터</h1>
<p>gridsearch.best_params_</p>
<h1 id="결과--max_depth-2">결과 : {&#39;max_depth&#39;: 2}</h1>
<pre><code>### 3) pipeline + GridSearchCV</code></pre><h1 id="pipeline-생성-모델">pipeline 생성 모델</h1>
<p>from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler</p>
<p>estimators = [
    (&#39;scaler&#39;, StandardScaler()),
    (&#39;clf&#39;, DecisionTreeClassifier(random_state=13))
]</p>
<p>pipe = Pipeline(estimators)</p>
<pre><code></code></pre><h1 id="param-지정">param 지정</h1>
<p>param_grid = [{&#39;clf__max_depth&#39;:[2,4,7,10]}]</p>
<h1 id="gridsearchcv">GridSearchCV</h1>
<p>GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)</p>
<h1 id="fit-4">fit</h1>
<p>GridSearch.fit(X,y)</p>
<pre><code>- ![](https://velog.velcdn.com/images/jaam_mini/post/2bbac17c-ee51-45ae-983c-3c2ec9d6f254/image.png)

### 4) DataFrame으로 예쁘게 정리
</code></pre><p>import pandas as pd</p>
<h1 id="gridsearch-변수에-cv_results_를-호출">GridSearch 변수에 cv_results_를 호출</h1>
<h1 id="cv_results_--파라미터-조합별-결과-조회">cv_results_ : 파라미터 조합별 결과 조회</h1>
<p>score_df = pd.DataFrame(GridSearch.cv_results_)
score_df</p>
<pre><code></code></pre><h1 id="보고-싶은-컬럼들만-확인">보고 싶은 컬럼들만 확인</h1>
<p>score_df[[&#39;params&#39;, &#39;rank_test_score&#39;, &#39;mean_test_score&#39;, &#39;std_test_score&#39;]]</p>
<pre><code>
































`
</code></pre>]]></description>
        </item>
        <item>
            <title><![CDATA[Tableau - (과제2) 주유소 평균 가격]]></title>
            <link>https://velog.io/@jaam_mini/Tableau-%EA%B3%BC%EC%A0%9C2-%EC%A3%BC%EC%9C%A0%EC%86%8C-%ED%8F%89%EA%B7%A0-%EA%B0%80%EA%B2%A9</link>
            <guid>https://velog.io/@jaam_mini/Tableau-%EA%B3%BC%EC%A0%9C2-%EC%A3%BC%EC%9C%A0%EC%86%8C-%ED%8F%89%EA%B7%A0-%EA%B0%80%EA%B2%A9</guid>
            <pubDate>Sun, 21 Jan 2024 16:51:37 GMT</pubDate>
            <description><![CDATA[<h1 id="1-지역별-평균-가격">1. 지역별 평균 가격</h1>
<hr>
<h3 id="1_self-계산-필드">1_self (계산 필드)</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/643a3d79-f986-4d16-bab6-6a6c544f588b/image.png" alt=""></li>
</ul>
<h3 id="2_휘발유-가격">2_휘발유 가격</h3>
<ul>
<li>매개변수<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/ba5d7bf4-cbec-419c-862f-fbd1402205a6/image.png" alt=""></li>
</ul>
</li>
</ul>
<ul>
<li>계산된 필드<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/d3d4ea91-291c-44d8-9848-2966fc9e3284/image.png" alt=""></li>
</ul>
</li>
</ul>
<h3 id="3_완성">3_완성</h3>
<p>조금 다르지만 일단 완성..</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/2d8389ae-a5f1-4ccb-84c8-71ef8ba2ac13/image.png" alt=""></li>
</ul>
<p></br></br></br></p>
<h1 id="2-거리">2. 거리</h1>
<hr>
<h3 id="1_시설정보">1_시설정보</h3>
<ul>
<li><p>매개변수</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/53b02283-6594-42c1-9f12-3f9b33aa6f8b/image.png" alt=""></li>
</ul>
</li>
<li><p>계산필드</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/ce56f113-6802-4dce-8ede-7380f2e08219/image.png" alt=""></li>
</ul>
</li>
</ul>
<h3 id="2_경도위도">2_경도/위도</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/d032dd6b-9fe1-4dac-a11b-e61e1e4f59a7/image.png" alt=""></li>
</ul>
<h3 id="3_완성-1">3_완성</h3>
<p>드디어 배경을 거리맵으로 하는 방법을 찾아서 적용했다!
미리 알았더라면 이디야 답안도 이렇게 제출했을텐데... public은 처음부터 새롭게 만들어야 해서...주유소 문제만이라도 답안과 동일하게 만들기로..!
(지금이라도 찾아서 다행이라고 생각한다 😉)</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/f9337bf7-fc92-4ecc-92b9-7eb581708524/image.png" alt=""></li>
</ul>
<p></br></br></br></p>
<h1 id="3-휘발유경유-최저최고">3. 휘발유/경유 최저/최고</h1>
<hr>
<h3 id="1_휘발유-최저">1_휘발유 최저</h3>
<p>최대한 답안지와 비슷하게 만들었다...</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/172f4a65-e226-4c79-a672-16ff3c0a0222/image.png" alt=""></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/55b8bee9-ac83-4ebf-98ba-44b329855741/image.png" alt=""></li>
</ul>
<h3 id="2_휘발유-최고">2_휘발유 최고</h3>
<p>최저 sheet를 복사해서 수정!</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/109932a0-24c7-4b1c-a605-1fd1f4442aa3/image.png" alt=""></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a617f5ca-ac38-43c3-a716-71f06386bcce/image.png" alt=""></li>
</ul>
<h3 id="3_경유-최저">3_경유 최저</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a2123904-acc7-4a65-92cb-863660642fe8/image.png" alt=""></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/2836b187-787c-44cc-9be7-f7146ac34df8/image.png" alt=""></li>
</ul>
<h3 id="4_경유-최고">4_경유 최고</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/cc891a29-37f9-454d-9980-46db2de4192f/image.png" alt=""></li>
</ul>
<p></br></br></br></p>
<h1 id="4-최종">4. 최종</h1>
<hr>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/7a3c0aaf-bc31-41b9-a20b-2bc7616acc6a/image.png" alt=""></p>
<p><a href="https://public.tableau.com/shared/59KH8ZR5J?:display_count=n&amp;:origin=viz_share_link">https://public.tableau.com/shared/59KH8ZR5J?:display_count=n&amp;:origin=viz_share_link</a></p>
]]></description>
        </item>
        <item>
            <title><![CDATA[Tableau - (과제1) 이디야 스타벅스 매장 간 거리 _ 답안 추가]]></title>
            <link>https://velog.io/@jaam_mini/Tableau-%EA%B3%BC%EC%A0%9C1-%EC%9D%B4%EB%94%94%EC%95%BC-%EC%8A%A4%ED%83%80%EB%B2%85%EC%8A%A4-%EB%A7%A4%EC%9E%A5-%EA%B0%84%EC%9D%98-%EA%B1%B0%EB%A6%AC-%ED%99%95%EC%9D%B8</link>
            <guid>https://velog.io/@jaam_mini/Tableau-%EA%B3%BC%EC%A0%9C1-%EC%9D%B4%EB%94%94%EC%95%BC-%EC%8A%A4%ED%83%80%EB%B2%85%EC%8A%A4-%EB%A7%A4%EC%9E%A5-%EA%B0%84%EC%9D%98-%EA%B1%B0%EB%A6%AC-%ED%99%95%EC%9D%B8</guid>
            <pubDate>Fri, 19 Jan 2024 04:13:50 GMT</pubDate>
            <description><![CDATA[<h1 id="1-기본-설정-세팅">1. 기본 설정 세팅</h1>
<hr>
<h3 id="1_매장간-거리">1_매장간 거리</h3>
<ul>
<li>MAKEPOINT는 위도와 경도로 구성된 공간 개체를 반환해줌</li>
<li>DISTANCE는 (시작, 끝, 단위)로 이뤄져 있음</li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/0c2d8232-f8b0-4114-a2c7-765abf4e42c2/image.png" alt=""></li>
</ul>
<h3 id="2_이디야-매장수">2_이디야 매장수</h3>
<ul>
<li>COUNTD() 사용</li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/61e80bba-fb5c-491b-9944-411a2b2c7641/image.png" alt=""></li>
</ul>
<h3 id="3_meters-away-매개변수">3_Meters Away (매개변수)</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a40cf2bc-790d-40fc-8dae-92e1a24811b9/image.png" alt=""></li>
</ul>
<h3 id="4_거리-설정">4_거리 설정</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/075a4d23-213b-4547-8c54-1dd368a4a265/image.png" alt=""></li>
</ul>
<p></br></br></br></p>
<h1 id="2-그래프-만들기">2. 그래프 만들기</h1>
<hr>
<h3 id="1_매장수">1_매장수</h3>
<p>답안지와 동일한 그래프로 만들기 위해 표 선(그리드)를 찾는데 한참 걸렸다...</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/38b48380-50e2-4d43-ac35-0543b13ae829/image.png" alt=""></li>
</ul>
<h3 id="2_비중">2_비중</h3>
<p>[매장수] 시트를 복제해서 사용</p>
<p>Meters Away 위의 Total을 [숨기기]처리 했다. 
[이항목만유지]하는 경우 비중이 100%로 뜨기 때문!</p>
<p>[테이블(아래로)] 도 잊지 말아야 한다!</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8ce8b1a7-c767-4a63-9814-8bc42e9bd329/image.png" alt=""></li>
</ul>
<h3 id="3_매장수네모박스">3_매장수(네모박스)</h3>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/52484400-ed85-45e0-944f-acdbcf499ffe/image.png" alt=""></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/649f232b-275e-46d6-b358-4064fb7b9253/image.png" alt=""></li>
</ul>
<h3 id="4_지도맵">4_지도맵</h3>
<p>먼저 경도, 위도를 생성한다</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/93bc4531-ae27-4be6-93d5-f9086e1c9fb7/image.png" alt=""></li>
</ul>
<p>(시군구)를 그래프에 나타내기 위해 (이전 학습 rawdata)를 불러오면 아래와 같은 경고 문구가 뜬다</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/3f97c819-fefc-4025-b405-5b97a95f2a39/image.png" alt=""></li>
</ul>
<p>이를 해결하기 위해 [데이터를 유니온 해줬다]
컬럼은 [E Gu] = [시군구] 로 맞춰줘야 한다!</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/98940e22-994c-466b-a458-f6ee1371638b/image.png" alt=""></li>
</ul>
<p>2시간 걸린...결과물</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/f84088d9-5022-4102-845e-2aa109941ad7/image.png" alt=""></li>
</ul>
<h3 id="5_거리">5_거리</h3>
<p>정.말 어떻게 구성해야 할지 한시간 고민한 sheet 였다 🙄</p>
<p>1) E name &gt; 레이블 &gt; 측정값 &gt; 카운트</p>
<p>2) Buffer 생성
이 함수를 다들 어떻게 찾은 걸까..?</p>
<ul>
<li>스타벅스의 위도와 경도를 기준으로 매개변수 &#39;거리&#39; 만큼 &#39;meter&#39;단위로 원을 그리는 함수</li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/f23b8caa-c23b-4055-b004-32bf9eeb00e0/image.png" alt=""></li>
</ul>
<p>3) MAKEPOINT()
이디야 매장수를 찍기 위해서 생성</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/10efffd1-000c-4a6d-93af-db755b883608/image.png" alt=""></li>
</ul>
<p>4) 이중축</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/978b15db-0b2f-4a30-b6f2-d36e7ace9788/image.png" alt=""></li>
</ul>
<p>5) 스타벅스와의 거리</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/1f2b5175-d2b1-4f43-9ede-f4fe5dbef45e/image.png" alt=""></li>
</ul>
<p>6) 이디야 포인트</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/deaad6dc-a237-405a-a719-5ad18d249f8e/image.png" alt=""></li>
</ul>
<p>답지와 동일한 지도로 하고 싶었지만, 해결하지 못했다..</p>
<ul>
<li>답지 지도
<img src="https://velog.velcdn.com/images/jaam_mini/post/7e817edd-dafc-4def-aea6-ca5e0e031656/image.png" alt=""></li>
</ul>
<p></br></br></br></p>
<h1 id="3-대시보드">3. 대시보드</h1>
<hr>
<p>1) 동작 설정
왼쪽 지도에서 구를 선택 시, 오른쪽 지도에서 해당 구가 필터링 되게 해야 한다</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/23ce273e-a2b5-4057-ba3d-9bf9a01064ad/image.png" alt=""></li>
</ul>
<p>2) 하이라이트 생성
위 그래프 선택 시 아래 [비중]이 하이라이트되게 해야 한다</p>
<ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b2e95c8f-cf81-4104-bac4-1ee38f59b20b/image.png" alt=""></li>
</ul>
<h1 id="4-결과물">4. 결과물</h1>
<hr>
<ul>
<li><a href="https://public.tableau.com/shared/C99FQK3J4?:display_count=n&amp;:origin=viz_share_link">https://public.tableau.com/shared/C99FQK3J4?:display_count=n&amp;:origin=viz_share_link</a> </li>
</ul>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/9d53505c-5830-4bfa-8f14-91853db8b4b2/image.png" alt=""></p>
<p>Tableau가 쉬우니 금방 따라갈 거라던 내 친구들의 말이 무색하게 굉-장히 힘들게 과제를 풀었다...
결과물도 답안과 조금 달라 속상하지만 ! 최선을 다했음에 만족..!</p>
<p></br></br></br></br></br></br></br></p>
<h1 id="5😆답안지">5.😆답안지</h1>
<hr>
<blockquote>
<h3 id="전체-매장수">전체 매장수</h3>
</blockquote>
<h3 id="📌-기본-설정br">📌 기본 설정</BR></h3>
<ol>
<li>스타벅스 - 이디야 매장간 거리 계산을 위한 매장별 위치값 필요 : <code>MAKEPOINT</code> 이용</li>
</ol>
<ul>
<li>스타벅스 위치값<ul>
<li>필드명 : S_LOCATION</li>
<li>MAKEPOINT([스타벅스 위도], [경도])</li>
</ul>
</li>
<li>이디야 위치값<ul>
<li>필드명 : E_LOCATION</li>
<li>MAKEPOINT([이디야 위도], [경도]) </BR></li>
</ul>
</li>
</ul>
<ol start="2">
<li>스벅/이디야 매장간 거리</li>
</ol>
<ul>
<li>필드명 : DISTANCE</li>
<li>DISTANCE([S_LOCATION], [E_LOCATION], &quot;M&quot;)</BR></li>
</ul>
<ol start="3">
<li>이디야 매장수</li>
</ol>
<ul>
<li>필드명 : &#39;# OF EDIYA<ul>
<li>COUNT([E ID])</BR></li>
</ul>
</li>
</ul>
<ol start="4">
<li>특정 거리내에 위치한 매장에 참/거짓 달기</li>
</ol>
<ul>
<li>필드명 : T/F_METERS AWAY<ul>
<li>[METERS AWAY]보다 작거나 같으면 &#39;METERS AWAY&#39;, 아닌 경우 &#39;TOTAL&#39;</li>
<li>IIF([DISTANCE] &lt;= [METERS AWAY], &#39;METERS AWAY&#39;, &#39;TOTAL&#39;)</li>
</ul>
</li>
<li>매개변수 : METERS AWAY
<img src="https://velog.velcdn.com/images/jaam_mini/post/95d0f56e-6c93-404b-9ea2-2642b4d84d12/image.png" alt=""></li>
</ul>
</BR>

<h3 id="📌-그래프">📌 그래프</h3>
<ul>
<li>이디야 매장정보를 활용할거라서, E GU &gt; 열 선반</li>
<li>T/F_METERS AWAY &gt; 행렬 선반 &gt; 내림차순</li>
<li>&#39;# OF EDIYA &gt; 마크 &gt; 텍스트</li>
<li>METERS AWAY &gt; 우클릭 &gt; 매개변수 사용</li>
</ul>
<p></BR></BR></p>
<blockquote>
<h3 id="거리내-매장수">거리내 매장수</h3>
</blockquote>
<h3 id="📌-그래프-1">📌 그래프</h3>
<ul>
<li>전체 매장수를 볼꺼기 때문에 T/F_METERS AWAY &gt; 필터 &gt; 거리 내 매장수만 표시할 예정으로 &#39;METERS AWAY&#39; 체크</li>
<li>&#39;# OF EDIYA &gt; 마크 &gt; 텍스트</li>
</ul>
<p></BR></BR></p>
<blockquote>
<h3 id="비중">비중</h3>
</blockquote>
<h3 id="📌-그래프-2">📌 그래프</h3>
<ul>
<li>[전체 매장수] 복제 </li>
<li>&#39;# OF EDIYA &gt; 퀵테이블계산 &gt; 구성비율 &gt; 우클릭 &gt; 다음을 사용해 계산 &gt; 테이블 아래 &gt; 우클릭 &gt; 서식 &gt; 숫자 &#39;백분율&#39;,&#39;소수점 0&#39;</li>
</ul>
<p></BR></BR></p>
<blockquote>
<h3 id="시군구-맵차트">시군구 맵차트</h3>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/8b5de257-a919-4aaa-b976-39006a9fadea/image.png" alt=""></p>
</blockquote>
<h3 id="📌-기본-설정">📌 기본 설정</h3>
<ul>
<li>시군구 정보만 가지고 있는 별도의 필드가 없는 상태</li>
<li>SPLIT을 사용해 [시군구 필드]만들 예정<ul>
<li>이디야 매장 기준 지역별 매장수 확인 예정 &gt; [E ADDRESS]사용</li>
<li>[E ADDRESS] &gt; 우클릭 &gt; 변환 &gt; 사용자지정분할 &gt; &#39;구분기호&#39;: 띄어쓰기 1칸, &#39;분할해제&#39; 2열</li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/39973770-63c4-47e3-abfd-38280f99f355/image.png" alt=""></li>
</ul>
</li>
<li>[분할1] [분할2] 생성됨</li>
<li>[분할1] &gt; 우클릭 &gt; 편집 &gt; 필드 이름 : 시도 &gt; 지리적 역할 &gt; 주시도</li>
<li>[분할2] &gt; 우클릭 &gt; 편집 &gt; 필드 이름 : 시군구 &gt; 지리적 역할 &gt; 시군구</li>
<li>[시도], [시군구] 선택 &gt; 우클릭 &gt; 계층 &gt; 계층 만들기 &gt; [지역]으로 이름 짓기</li>
<li>⭐ [E GU]를 세부정보에 추가해야 함, 이후 대시보드 동작 시 원본/대상 시트가 같은 정보를 갖고 있어야 동작기능이 정상적으로 작동하기 때문 ; [E GU] &gt; 세부정보 로 드래그</li>
</ul>
<h3 id="📌-그래프-3">📌 그래프</h3>
<ul>
<li>경도, 위도 &gt; 더블클릭</li>
<li>시군구 &gt; 마크 &gt; 맵 으로 변경</li>
</ul>
<p>이제 스타벅스 기준, 특정거리 내에 있는 이디야 매장수 표시해야 함</p>
<ul>
<li>&#39;# OF EDIYA &gt; 마크</li>
<li>&#39;# OF EDIYA &gt; 색상 &gt; 오른쪽 상단 색상범례 더블클릭 &gt; &#39;남색&#39;으로 색상 변경</li>
<li>T/F_METERS AWAY &gt; 필터 &gt; &#39;METERS AWAY&#39;</li>
<li>시군구 &gt; 레이블 &gt; 텍스트 편집 &gt; </li>
<li>상단 &#39;맵&#39; 메뉴 &gt; 맵 계층 &gt; 백그라운드 투명도 100% 로 변경</li>
</ul>
<p></BR></BR></p>
<blockquote>
<h3 id="거리-맵차트">거리 맵차트</h3>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/b4f88e90-7e5e-462b-b72b-14927549ad5f/image.png" alt=""></p>
</blockquote>
<p><code>BUFFER</code> 함수를 써서 특정 위치를 중심으로 지정된 거리 만큼 원으로 표시되게 할 것임</p>
<p>스타벅스를 기준으로 &#39;매개변수&#39; 거리 만큼 <code>BUFFER</code>를 만들 것임</p>
<h3 id="📌-기본-설정-1">📌 기본 설정</h3>
<ul>
<li>필드이름 : S_BUFFER</li>
<li>BUFFER([S_LOCATINO], [METERS AWAY], &quot;M&quot;)</li>
</ul>
<h3 id="📌-그래프-4">📌 그래프</h3>
<p>스타벅스 매장 위치를 표시할 것임</p>
<ul>
<li>위도, 경도 추가</li>
<li>S_BUFFER 더블클릭</li>
<li>⭐ S_NAME &gt; 마크로 드래그 : 하나로 뭉쳐있는  S_BUFFER를 나눠주는 기능 </li>
<li>S_BRAND &gt; 색상 &gt; 색상범례 더블클릭 &gt; 진한 초록색</li>
</ul>
</BR>

<p>이디야 매장 위치를 표시할 것임</p>
<ul>
<li>위도 + CTRL &gt; 옆으로 복제</li>
<li>기존 마크에 있던 모든 것을 빼줌</li>
<li>E_LOCATINO 더블클릭</li>
<li>E_NAME &gt; 세부정보 드래그</li>
<li>E_BRAND &gt; 색상 &gt; 빨간색으로 변경</li>
<li>오른쪽 위도 &gt; 우클릭 &gt; 이중축</li>
</ul>
</BR>

<p>특정 거리 내에 위치한 이디야 매장만 확인하면 됨
관련 없는 매장은 없애기 위해</p>
<ul>
<li>T/F_METERS AWAY &gt; 필터 &gt; METERS AWAY</li>
</ul>
</BR>

<p>이디야 매장수를 숫자로 표시하기 위해</p>
<ul>
<li><p>T/F_METERS AWAY 복제 &gt; 우클릭 &gt; 편집</p>
<ul>
<li>필드명 : &#39;# OF EDIYA_HIDDEN</li>
<li>합계(참인 경우 숫자 1, 거짓인 경우 0)</li>
<li>SUM(IIF([DISTANCE] &lt;= [METERS AWAY], 1, 0))</li>
</ul>
</li>
<li><p>&#39;# OF EDIYA_HIDDEN &gt; 우클릭 &gt; 연속형 으로 변경 &gt; 왼쪽 위도 레이블에 드래그 &gt; 우클릭 &gt; 서식 &gt; 글자 크기 변경</p>
</li>
</ul>
</BR>

<p>맵 예쁘게 변경하기</p>
<ul>
<li>상단 &#39;맵&#39; &gt; 배경맵 &gt; 거리 선택</li>
</ul>
<p></BR></BR></p>
<blockquote>
<h3 id="대시보드">대시보드</h3>
</blockquote>
<ul>
<li><p>대시보드 &gt; 동작 &gt; 필터 
<img src="https://velog.velcdn.com/images/jaam_mini/post/d0d9babd-2bf9-45fe-b9e1-679ec7ce2355/image.png" alt=""></p>
</li>
<li><p>대시보드 &gt; 동작 &gt; 하이라이트
<img src="https://velog.velcdn.com/images/jaam_mini/post/1397e00f-df22-4d6c-abd3-69839afb47a7/image.png" alt=""></p>
</li>
</ul>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 3. label_encoder , min-max scaling , Standard , Robust Scaler]]></title>
            <link>https://velog.io/@jaam_mini/ML-3.-labelencoder-min-max-scaling-Standard-Robust-Scaler</link>
            <guid>https://velog.io/@jaam_mini/ML-3.-labelencoder-min-max-scaling-Standard-Robust-Scaler</guid>
            <pubDate>Wed, 17 Jan 2024 11:13:09 GMT</pubDate>
            <description><![CDATA[<h1 id="1-label_encoder">1. label_encoder</h1>
<hr>
<pre><code>import pandas as pd

df = pd.DataFrame({
    &#39;A&#39; : [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;a&#39;, &#39;b&#39;],
    &#39;B&#39; : [1,2,3,1,0]
})

df</code></pre><h3 id="1-fit--transform-문자---숫자">1) fit ~ transform (문자 -&gt; 숫자)</h3>
<pre><code>from sklearn.preprocessing import LabelEncoder

# 변수 설정
le = LabelEncoder()

# (1) 학습(df의 A컬럼을 기준으로)
le.fit(df[&#39;A&#39;])</code></pre><pre><code># (2) 잘 학습되었는지 확인
le.classes_</code></pre><p>array([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], dtype=object)</p>
<pre><code># (3) transformation (fit 이후 해야 함)
le.transform(df[&#39;A&#39;])</code></pre><p>array([0, 1, 2, 0, 1])</p>
<pre><code># (4) transformation 값, 컬럼에 넣기
df[&#39;le_A&#39;] = le.transform(df[&#39;A&#39;])</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/79cbbc47-f7f3-4777-ae4f-af9790b668de/image.png" alt=""></li>
</ul>
<h3 id="2-fittransform">2) fit+transform</h3>
<pre><code>le.fit_transform(df[&#39;A&#39;])</code></pre><p>array([0, 1, 2, 0, 1])</p>
<h3 id="3-답-물어보기">3) 답 물어보기</h3>
<pre><code>le.transform([&#39;a&#39;])</code></pre><p>array([0])</p>
<h3 id="4-역변환-문자---숫자">4) 역변환 (숫자 -&gt; 문자)</h3>
<pre><code>le.inverse_transform(df[&#39;le_A&#39;])</code></pre><p>array([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;a&#39;, &#39;b&#39;], dtype=object)</p>
<h1 id="2-min-max-scaling-정규화">2. min-max scaling (정규화)</h1>
<p>(min)은 0으로, (max)는 1로 만들어 줌</p>
<hr>
<pre><code>df = pd.DataFrame({
    &#39;A&#39; : [10,20,-10,0,25],
    &#39;B&#39; : [1,2,3,1,0]
})

df</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/e44da7ab-e1fe-487f-8a54-eebc79340c01/image.png" alt=""></li>
</ul>
<h3 id="1-fit">1) fit</h3>
<pre><code># MinMaxScaler 모듈
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
mms.fit(df)</code></pre><h3 id="2-데이터-확인">2) 데이터 확인</h3>
<pre><code># data_range_ : 분모 역할(전체 길이)
mms.data_max_, mms.data_min_, mms.data_range_</code></pre><p>(array([25.,  3.]), array([-10.,   0.]), array([35.,  3.]))</p>
<h3 id="3-transform">3) transform</h3>
<ul>
<li>(min)은 0으로, (max)는 1로 만들어 줌<pre><code>df_mms = mms.transform(df)
df_mms</code></pre></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/d15ae64c-abbd-4198-b3f1-c85743c5042f/image.png" alt=""></li>
</ul>
<h3 id="4-역변환">4) 역변환</h3>
<pre><code>mms.inverse_transform(df_mms)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/18c31675-25fc-4c79-8f65-950d3f887340/image.png" alt=""></li>
</ul>
<h3 id="5-한번에">5) 한번에~</h3>
<pre><code>mms.fit_transform(df)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/d24579e3-8f71-4c12-88de-d46f12e6ba52/image.png" alt=""></li>
</ul>
<h1 id="3-standard-scaler-표준화">3. Standard Scaler (표준화)</h1>
<p>표준정규분포 (평균을 빼고 표준편차로 나눠주는~)</p>
<hr>
<h3 id="1-fit-1">1) fit</h3>
<pre><code>from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(df)</code></pre><h3 id="2-표준편차-작동-인자">2) 표준편차 작동 인자</h3>
<ul>
<li>(분모) 표준편차</li>
<li>(분자) 평균<pre><code># 평균, 표준편차
ss.mean_, ss.scale_</code></pre>(array([9. , 1.4]), array([12.80624847,  1.0198039 ]))</li>
</ul>
<h3 id="3-transform-1">3) transform</h3>
<pre><code>df_ss = ss.transform(df)
df_ss</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/02f5cc26-ae13-495e-8cdd-f420951d224f/image.png" alt=""></li>
</ul>
<h3 id="4-한번에">4) 한번에~</h3>
<pre><code>ss.fit_transform(df)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/bdce6855-d4b4-46b1-9286-0aa10aa2bbee/image.png" alt=""></li>
</ul>
<h1 id="4-robust-scaler">4. Robust Scaler</h1>
<hr>
<pre><code>df = pd.DataFrame({
    &#39;A&#39; : [-0.1,0.,0.1,0.2,0.3,0.4,1.0,1.1,5]
})

df</code></pre><h3 id="1-3가지-모듈을-한번에-적용">1) 3가지 모듈을 한번에 적용</h3>
<pre><code>from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

mm = MinMaxScaler()
ss = StandardScaler()
rs = RobustScaler()</code></pre><h3 id="2-fit_transform--컬럼-추가">2) fit_transform + 컬럼 추가</h3>
<pre><code>df_scaler = df.copy()

df_scaler[&#39;MinMax&#39;] = mm.fit_transform(df)
df_scaler[&#39;Standard&#39;] = ss.fit_transform(df)
df_scaler[&#39;Robust&#39;] = rs.fit_transform(df)

df_scaler</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/c78cdcc3-43e1-4f58-8a66-1ca699c3d5ec/image.png" alt=""></li>
</ul>
<h3 id="3-이해를-위해-boxplot">3) 이해를 위해 Boxplot</h3>
<pre><code>import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(16,6))
sns.set_theme(style=&#39;whitegrid&#39;)
sns.boxplot(data=df_scaler, orient=&#39;h&#39;) #orient=&#39;h&#39;:수평bar</code></pre><ul>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/c337549c-4e1d-4d69-a1a0-5e523242b161/image.png" alt=""></p>
</li>
<li><p>A 안에는 0.1 기준으로 증감하는 데이터들 사이에 5라는 아웃라이어가 있음</p>
</li>
<li><p>5라는 아웃라이어 때문에 MinMaxScaler 결과 한쪽으로 치우치게 됨</p>
</li>
<li><p>MinMaxScaler는 아웃라이어의 영향을 받으면 데이터가 이상해질 수 있음</p>
</li>
<li><p>평균과 중앙값을 쓸 때, 평균은 이상치를 반영하고, 중앙값은 이상치 영향을 덜 받게 됨</p>
</li>
<li><p>StandardScaler를 확인했을 때, 평균이 반영되어 대다수의 데이터가 왼쪽으로 치우침</p>
</li>
<li><p>RobustScaler는 median이 0이 되고, 아웃라이어는 그대로 유지되며 데이터에 영향을 크게 주지 않음</p>
</li>
</ul>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 2. Titanic 생존 분석 _ titanic disaster kaggle]]></title>
            <link>https://velog.io/@jaam_mini/ML-2.-Titanic-%EC%83%9D%EC%A1%B4-%EB%B6%84%EC%84%9D-titanic-disaster-kaggle</link>
            <guid>https://velog.io/@jaam_mini/ML-2.-Titanic-%EC%83%9D%EC%A1%B4-%EB%B6%84%EC%84%9D-titanic-disaster-kaggle</guid>
            <pubDate>Wed, 17 Jan 2024 08:44:48 GMT</pubDate>
            <description><![CDATA[<pre><code># !pip install plotly_express</code></pre><pre><code>import pandas as pd

titanic_url = &#39;http://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/titanic.xls&#39;
titanic = pd.read_excel(titanic_url)
titanic.head(2)</code></pre><p></br></br></br></p>
<h1 id="1-데이터-정리">1. 데이터 정리</h1>
<hr>
</br>

<h3 id="생존상황-확인">생존상황 확인</h3>
<pre><code>import matplotlib.pyplot as plt
import seaborn as sns</code></pre><pre><code># plt.subplots : 그래프 2개 한번에 그리기 (1행, 2열로)
f, ax = plt.subplots(1,2,figsize=(18,8))

# titanic[&#39;survived&#39;].value_counts() : [0]비생존자, [1]생존자
# .plot.pie : 동그란 그래프로 그리자
titanic[&#39;survived&#39;].value_counts().plot.pie(
    explode=[0, 0.05], # 조각들 멀어지기
    autopct=&#39;%1.1f%%&#39;, # 소수점 첫째 자리까지 수치 입력
    ax=ax[0], # ax를 첫번째로 그려줘
    shadow=True # 그림자 생김
)
ax[0].set_title(&#39;Pie plot - survived&#39;)
ax[0].set_ylabel(&#39;&#39;)

sns.countplot(x=&#39;survived&#39;, data=titanic, ax=ax[1])
ax[1].set_title(&#39;Count plot - survived&#39;)

plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/aea0a3a8-7888-4ab9-9ff3-852479b361ba/image.png" alt=""></li>
</ul>
</br>

<h3 id="2-성별">2) 성별</h3>
<pre><code>f, ax = plt.subplots(1,2,figsize=(18,8))

sns.countplot(x=&#39;sex&#39;, data=titanic, ax=ax[0])
ax[0].set_title(&#39;Count of Passengers of Sex&#39;)
ax[0].set_ylabel(&#39;&#39;)

sns.countplot(x=&#39;sex&#39;, data=titanic, hue=&#39;survived&#39;, ax=ax[1])
ax[1].set_title(&#39;Sex ; survived and Unsurvived&#39;)

plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/230c5806-86b9-4012-be0b-94b501c845b5/image.png" alt=""></li>
</ul>
</br>

<h3 id="3-경제력">3) 경제력</h3>
<ul>
<li>crosstab : 2번째 컬럼을 구분지어 주고, 인덱스에 1번째 컬럼을 담아 줌</li>
<li>margins=True : 합계<pre><code>pd.crosstab(titanic[&#39;pclass&#39;], titanic[&#39;survived&#39;], margins=True)</code></pre></li>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/6e7e81f9-a6c1-4b79-954d-25a684c4e4ec/image.png" alt=""></li>
</ul>
</br>

<h3 id="4-등급성별-_facetgrid">4) 등급/성별 _FacetGrid</h3>
<pre><code># FacetGrid(변수 지정, 행, 컬럼, 높이, 넓이)
grid = sns.FacetGrid(titanic, row=&#39;pclass&#39;, col=&#39;sex&#39;, height=4, aspect=2)

# hist을 넣어라, 나이를 기준으로, 투명도는 0.8로, 수평축의 간격)
grid.map(plt.hist, &#39;age&#39;, alpha=.8, bins=20)

# 각 격자 안의 색상에 대한 범례 지정
grid.add_legend();</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/af2a7919-80a4-4777-ad66-b359165f2d27/image.png" alt=""></li>
</ul>
</br>

<h3 id="5-나이">5) 나이</h3>
<pre><code>import plotly.express as px

# px에 히스토그램을 그려줘(데이터는 타이타닉, 컬럼은 나이)
fig = px.histogram(titanic, x=&#39;age&#39;)
fig.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/84248c5b-6043-4297-963b-ebb59aebc24e/image.png" alt=""></li>
</ul>
</br>

<h3 id="6-선실-등급별">6) 선실 등급별</h3>
<pre><code>grid = sns.FacetGrid(titanic, col=&#39;survived&#39;, row=&#39;pclass&#39;, height=4, aspect=2)
grid.map(plt.hist, &#39;age&#39;, alpha=.5, bins=20)
grid.add_legend();</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/eb57d5ff-2512-4708-8e88-d434d77decd8/image.png" alt=""></li>
</ul>
</br>

<h3 id="7-나이-5단계-정리">7) 나이 5단계 정리</h3>
<pre><code># age_cat 이라는 새로운 컬럼 만들기
titanic[&#39;age_cat&#39;] = pd.cut(
    titanic[&#39;age&#39;], # titanic 데이터의 age 컬럼
    bins = [0,7,15,30,60,100],
    include_lowest = True,
    labels = [&#39;baby&#39;, &#39;teen&#39;, &#39;young&#39;, &#39;adult&#39;, &#39;old&#39;]
    # 0-7:baby, 7-15:teen, 15-30:young, 30-60:adult, 60-100:old
)

titanic.head(2)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/5fff56ae-15e4-496a-805e-2e8c03f89a36/image.png" alt=""></li>
</ul>
</br>

<h3 id="8-나이성별등급">8) 나이/성별/등급</h3>
<pre><code>plt.figure(figsize=(12,4))

plt.subplot(131) #1행3열 중 1번째
sns.barplot(x=&#39;pclass&#39;, y=&#39;survived&#39;, data=titanic)

plt.subplot(132)
sns.barplot(x=&#39;age_cat&#39;, y=&#39;survived&#39;, data=titanic)

plt.subplot(133)
sns.barplot(x=&#39;sex&#39;, y=&#39;survived&#39;, data=titanic)

# plt.subplots_adjust(top=1, bottom=0.1, left=0.1, right=1, hspace=0.5, wspace=0.5)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/85023d38-8715-4d17-ac55-4abaee1e58c1/image.png" alt=""></li>
</ul>
</br>

<h3 id="9-남여나이">9) 남여/나이</h3>
<pre><code># 남여의 나이별 생존 현황
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

women = titanic[titanic[&#39;sex&#39;] == &#39;female&#39;]
men = titanic[titanic[&#39;sex&#39;] == &#39;male&#39;]

sns.distplot(women[women[&#39;survived&#39;] == 1][&#39;age&#39;],
                  bins=20, label=&#39;survived&#39;,
                  ax=ax[0], kde=False)

sns.distplot(women[women[&#39;survived&#39;] == 0][&#39;age&#39;],
                  bins=40, label=&#39;not survived&#39;,
                  ax=ax[0], kde=False)

ax[0].legend()
ax[0].set_title(&#39;Female&#39;)

sns.distplot(men[men[&#39;survived&#39;] == 1][&#39;age&#39;],
                 bins=18, label=&#39;survived&#39;,
                 ax=ax[1], kde=False)

sns.distplot(men[men[&#39;survived&#39;] == 0][&#39;age&#39;],
                 bins=40, label=&#39;not survived&#39;,
                 ax=ax[1], kde=False)

ax[1].legend()
ax[1].set_title(&#39;Male&#39;)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/cf3b6f21-2342-4f31-be69-9c0f42b06cc9/image.png" alt=""></li>
</ul>
</br>

<h3 id="10-이름-신분">10) 이름-신분</h3>
<ul>
<li><p>(1) 데이터 확인</p>
<pre><code>import re

for idx, dataset in titanic.iterrows():
    tmp = dataset[&#39;name&#39;]
print(re.search(&#39;\,\s\w+(\s\w+)?\.&#39;, tmp).group())</code></pre><p>, Mr.</p>
</li>
<li><p>(2) 데이터 형태 가공</p>
<pre><code>import re

title = []
for idx, dataset in titanic.iterrows():
  tmp = dataset[&#39;name&#39;] # 일시저장

  # ,로 시작 - \s한칸을 띄우고 - \w글자들이 나오다가 - ?몇글자 인지 모르겠지만 - .으로 끝남
  # 대상은 tmp
  # [2:-1] 두번째 ~ 마지막 -&gt; , Mr.
  title.append(re.search(&#39;\,\s\w+(\s\w+)?\.&#39;, tmp).group()[2:-1])

title</code></pre><p>[&#39;Miss&#39;,
 &#39;Master&#39;,
 &#39;Miss&#39;,
 &#39;Mr&#39;,
 &#39;Mrs&#39;,
 &#39;Mr&#39;,</p>
</li>
<li><p>(3) 컬럼으로 추가</p>
<pre><code>import re

title = []
for idx, dataset in titanic.iterrows():
  tmp = dataset[&#39;name&#39;]
  title.append(re.search(&#39;\,\s\w+(\s\w+)?\.&#39;, tmp).group()[2:-1])

titanic[&#39;title&#39;] = title
titanic.head(1)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/78dd8115-49a5-46f4-a28f-02f3c02bf684/image.png" alt=""></li>
</ul>
</li>
<li><p>(4) 타이틀:인덱스, 성별:컬럼</p>
<pre><code>pd.crosstab(titanic[&#39;title&#39;], titanic[&#39;sex&#39;])</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/c0ea9b67-218d-41fc-9504-73cb19760d2a/image.png" alt=""></li>
</ul>
</li>
<li><p>(5-1) 호칭 정리</p>
<pre><code>titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(&#39;Mlle&#39;, &#39;Miss&#39;)
titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(&#39;Mme&#39;, &#39;Miss&#39;)
titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(&#39;Ms&#39;, &#39;Miss&#39;)

#여성 귀족
Rare_f = [&#39;Dona&#39;, &#39;Lady&#39;, &#39;the Countess&#39;]
#남성 귀족
Rare_m = [&#39;Capt&#39;, &#39;Col&#39;, &#39;Don&#39;, &#39;Dr&#39;, &#39;Jonkheer&#39;,
          &#39;Major&#39;, &#39;Master&#39;, &#39;Rev&#39;, &#39;Sir&#39;]

for each in Rare_f:
  titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(each, &#39;Rare_f&#39;)

for each in Rare_m:
  titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(each, &#39;Rare_m&#39;)</code></pre></li>
<li><p>(5-2) 호칭 정리</p>
<pre><code>for each in Rare_f: # 여성용 호칭에서 하나씩 가져와서
    titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(each, &#39;Rare_f&#39;) # 각각을 전부 다 Rare_f로 바꾸겠다

for each in Rare_m: # 남성용 호칭에서 하나씩 가져와서
    titanic[&#39;title&#39;] = titanic[&#39;title&#39;].replace(each, &#39;Rare_m&#39;) # 각각을 전부 다 Rare_m로 바꾸겠다</code></pre></li>
<li><p>(5-3) 호칭 확인</p>
<pre><code>titanic[&#39;title&#39;].unique()</code></pre><p>array([&#39;Miss&#39;, &#39;Rare_m&#39;, &#39;Mr&#39;, &#39;Mrs&#39;, &#39;Rare_f&#39;], dtype=object)</p>
</li>
<li><p>(5-4) groupby</p>
<pre><code># [groupby] https://trading-for-chicken.tistory.com/134
# 특정 열을 지정하여 groupby할 경우 해당 열이 인덱스가 되는데, as_index=False로 하여 기존 인덱스 유지
titanic[[&#39;title&#39;, &#39;survived&#39;]].groupby([&#39;title&#39;], as_index=False).mean()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/cfd4252a-cffe-49d6-9af6-70f7271176bb/image.png" alt=""></li>
</ul>
</li>
</ul>
</br>
</br>
</br>
</br>

<h1 id="2-ml을-이용한-주인공-생존율-예측">2. ML을 이용한 주인공 생존율 예측</h1>
</br>


<h2 id="⭐-1-str---int">⭐ 1) str -&gt; int</h2>
<p>머신 러닝을 위해서 <code>컬럼</code>은 모두 <code>숫자</code>로 바꿔야 함</p>
<pre><code># 컬럼들 형태 확인
titanic.info()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/50af9f32-8f9c-447a-96b7-fd45b05de712/image.png" alt=""></li>
</ul>
<pre><code>titanic[&#39;sex&#39;].unique()</code></pre><p>array([&#39;female&#39;, &#39;male&#39;], dtype=object)</p>
<pre><code># 숫자 형태로 변경
# 3   sex        1309 non-null   object 

# LabelEncoder 모듈
# 라벨 인코더 모듈 : 문자 -&gt; 숫자 로 만들어 주는 것
from sklearn.preprocessing import LabelEncoder

# 변수 지정
le = LabelEncoder()

# fit 훈련 : (데이터, 정답)
le.fit(titanic[&#39;sex&#39;])

# gender 컬럼 생성 + transform(변환)
titanic[&#39;gender&#39;] = le.transform(titanic[&#39;sex&#39;])

titanic.head(2)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/ea382a02-4e22-4aef-bbee-c0874381e7bf/image.png" alt=""></li>
</ul>
</br>

<h2 id="⭐-2-결측치-버리기">⭐ 2) 결측치 버리기</h2>
<ul>
<li>위와 같이 컬럼 마다 데이터수 가 다름</li>
<li>ML을 위해 결측치는 버리고 가기로!<pre><code>titanic = titanic[titanic[&#39;age&#39;].notnull()] # 1046
titanic = titanic[titanic[&#39;fare&#39;].notnull()] # 1308</code></pre></li>
</ul>
</br>

<h2 id="⭐-3-훈련테스트-나누기">⭐ 3) 훈련/테스트 나누기</h2>
<pre><code>titanic.columns</code></pre><p>Index([&#39;pclass&#39;, &#39;survived&#39;, &#39;name&#39;, &#39;sex&#39;, &#39;age&#39;, &#39;sibsp&#39;, &#39;parch&#39;, &#39;ticket&#39;,
       &#39;fare&#39;, &#39;cabin&#39;, &#39;embarked&#39;, &#39;boat&#39;, &#39;body&#39;, &#39;home.dest&#39;, &#39;age_cat&#39;,
       &#39;title&#39;, &#39;gender&#39;],
      dtype=&#39;object&#39;)</p>
<pre><code>from sklearn.model_selection import train_test_split

X = titanic[[&#39;pclass&#39;,&#39;age&#39;, &#39;sibsp&#39;, &#39;parch&#39;,&#39;fare&#39;,&#39;gender&#39;]]
y = titanic[&#39;survived&#39;]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)</code></pre></br>

<h2 id="⭐-4-decision-tree">⭐ 4) Decision tree</h2>
<pre><code># DecisionTreeClassifier 모듈
from sklearn.tree import DecisionTreeClassifier
# 성능 확인 모듈
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier(max_depth=4, random_state=13)

# 훈련(특성, 라벨)
dt.fit(X_train, y_train)

# 훈련을 완료한 dt에 예측(성능) 명령
pred = dt.predict(X_test)

# (참값, 예측값)
accuracy_score(y_test, pred)</code></pre><p>0.7655502392344498</p>
</br>

<h2 id="⭐-5-두-주인공의-생존확률">⭐ 5) 두 주인공의 생존확률?</h2>
<pre><code>import numpy as np

# Jack
# 3등석, 18살, 부모형제 없음, 자녀없음, 탑승료, 남성
Deca = np.array([[3, 18, 0, 0, 5, 1]])
print(&#39;Deca :&#39;, dt.predict_proba(Deca))
print(&#39;Deca :&#39;, dt.predict_proba(Deca)[0, 1]) # 위 값의 첫,두번째 값만 출력해줘

# Rose
# 1등석, 16살, 
Wins = np.array([[1, 16, 1, 1, 100, 0]])
print(&#39;Rose :&#39;, dt.predict_proba(Wins))
print(&#39;Rose :&#39;, dt.predict_proba(Wins)[0, 1])</code></pre><p>Deca : [[0.83271375 0.16728625]]
Deca : 0.16728624535315986
Rose : [[0. 1.]]
Rose : 1.0</p>
]]></description>
        </item>
        <item>
            <title><![CDATA[ML - 1. Iris의 품종 분류]]></title>
            <link>https://velog.io/@jaam_mini/ML-1.-Iris%EC%9D%98-%ED%92%88%EC%A2%85-%EB%B6%84%EB%A5%98</link>
            <guid>https://velog.io/@jaam_mini/ML-1.-Iris%EC%9D%98-%ED%92%88%EC%A2%85-%EB%B6%84%EB%A5%98</guid>
            <pubDate>Mon, 15 Jan 2024 11:41:28 GMT</pubDate>
            <description><![CDATA[<h1 id="iris-데이터-불러오기">iris 데이터 불러오기</h1>
<hr>
<ul>
<li>모듈 install
<img src="https://velog.velcdn.com/images/jaam_mini/post/a82f81f7-711a-4cb1-bb42-571e6e02c7a6/image.png" alt=""></li>
</ul>
<p>(1) 데이터 불러오기</p>
<ul>
<li>sklearn 에 올라와 있는 데이터 이용<pre><code>from sklearn.datasets import load_iris
iris = load_iris()
iris</code></pre></li>
</ul>
<p>(2) 데이터 타입 확인</p>
<ul>
<li><p>각각의 데이터 확인</p>
<pre><code>iris.keys()</code></pre><pre><code># 줄 바꿈을 위해 print 사용
# &#39;DESCR&#39; 칼럼은 데이터의 설명이 들어있음
print(iris[&#39;DESCR&#39;])</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/71c3ccc4-b0d2-4679-b5d3-dbfa0692dc2c/image.png" alt=""></li>
</ul>
<pre><code>print(iris[&#39;target&#39;])
len(iris[&#39;target&#39;])</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/8f9cbd9c-f413-4e8a-a8c7-79f8b6e89c34/image.png" alt=""></li>
</ul>
<pre><code># 위의 데이터와 함께 보면, 0번이 setosa, 1번이 versicolor, 2번이 virginica

print(iris[&#39;target_names&#39;])</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/33473466-6a37-491e-81e1-f175c30021d3/image.png" alt=""></li>
</ul>
</li>
</ul>
<h2 id="3-dataframe-만들기">(3) DataFrame 만들기</h2>
<pre><code>  import pandas as pd

  iris_pd = pd.DataFrame(iris.data, columns=iris.feature_names)
  iris_pd.head()</code></pre><p>  <img src="https://velog.velcdn.com/images/jaam_mini/post/64f57603-0e62-4bf9-b6d6-17c5f4e80e1a/image.png" alt=""></p>
<h2 id="4-품종-정보-column에-포함¶">(4) 품종 정보 column에 포함¶</h2>
<pre><code>  iris_pd[&#39;species&#39;] = iris.target
  iris_pd.head()</code></pre><p>  <img src="https://velog.velcdn.com/images/jaam_mini/post/e446305a-97e7-4215-bcfa-4895c945b6d2/image.png" alt=""></p>
<p>(5) 그래프를 통해 데이터 확인</p>
<pre><code>import matplotlib.pyplot as plt
import seaborn as sns</code></pre><ul>
<li><p>sepal length 와 species 의 관계</p>
<ul>
<li>관계가 나쁨?</li>
<li>3개가 모두 겹쳐 있어, 구분이 어려움
<img src="https://velog.velcdn.com/images/jaam_mini/post/22e595a3-46ed-4ebf-b4ba-53f6c8476a9d/image.png" alt=""></li>
</ul>
</li>
<li><p>sepal width (cm) 와 species 관계</p>
<ul>
<li>3개가 겹쳐 있어 구분이 어려움
<img src="https://velog.velcdn.com/images/jaam_mini/post/e1ce853d-3a14-48d6-97c2-3d7a7c15412c/image.png" alt=""></li>
</ul>
</li>
<li><p>petal length (cm) 와 species</p>
<ul>
<li>분류됨
<img src="https://velog.velcdn.com/images/jaam_mini/post/1323af1d-6547-4604-b50b-9966757a8be8/image.png" alt=""></li>
</ul>
</li>
<li><p>pairplot </p>
<ul>
<li>구분되는 그래프가 있는지 확인하기<pre><code>sns.pairplot(iris_pd, hue=&#39;species&#39;)</code></pre><img src="https://velog.velcdn.com/images/jaam_mini/post/20cfd9f9-3572-4d1b-aebc-7eddaa6e5e9f/image.png" alt=""></li>
</ul>
</li>
</ul>
<p></br></br></br></br></p>
<h1 id="1-decision-tree">1. Decision Tree</h1>
<hr>
<p>나머지 인덱스1,2 를 어떻게 구분할 것인가?</p>
<p>📌 df[df[&#39;column(class)&#39;]] : df데이터를 슬라이싱 : 데이터를 선택해라
📌 class가 0은 다 빼라 : 컬럼 0을 제외한 데이터를 보여줘</p>
<pre><code>iris_pd[iris_pd[&#39;species&#39;] != 0]</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/4c489f60-19d8-47dc-a2d0-a59cbfbd6b14/image.png" alt=""></li>
</ul>
<pre><code>iris_12 = iris_pd[iris_pd[&#39;species&#39;] != 0]
iris_12.info()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/09d12021-5895-450e-aac9-47279a3ade50/image.png" alt=""></li>
</ul>
<p></br></br></p>
<h3 id="1-split-criterion-분할기준">(1) Split Criterion (분할기준)</h3>
<hr>
<p>📌직선 하나로 두개를 나눠야 함
📌어디 경계선이 최고 일까? 를 찾아야 함</p>
<pre><code>plt.figure(figsize=(4,2))
sns.scatterplot(x=&#39;petal length (cm)&#39;, y=&#39;petal width (cm)&#39;, data=iris_12, hue=&#39;species&#39;, palette=&#39;Set2&#39;);</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/a3f13e13-3a17-407f-bc48-8c7ba10beb3f/image.png" alt=""></li>
</ul>
<p></br></br></p>
<h3 id="2-엔트로피">(2) 엔트로피</h3>
<hr>
<p>📌 -pi log2 pi</p>
<ul>
<li>p는 해당 데이터가 해당 클래스에 속할 확률이고 위 식을 그려보면 다음과 같다</li>
<li>어떤 확률 분포로 일어나는 사건을 표현하는 데 필요한 정보의 양이며 이 값이 커질수록 확률 분포의 불확실성이 커지며 결과에 대한 예측이 어려워짐</li>
</ul>
<p>📌 엔트로피 = (-pi log2 pi)의 모든 합</p>
<ul>
<li>무질서할수록 엔트로피 값은 높다 (불확실 성이 높을 수록)</li>
<li>엔트로피 값이 내려갈수록, 질서가 잡혀가는 것!!!</li>
</ul>
<pre><code>import numpy as np

p = np.arange(0.001, 1, 0.001)
plt.grid()
plt.title(&#39;$-p \log_{2}{p}$&#39;)
plt.plot(p, -p*np.log2(p));</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/34697108-0a8f-4030-bb6f-0593abf86823/image.png" alt=""></li>
</ul>
<p></br></br></p>
<h3 id="2-1-예제">(2)-1 예제</h3>
<hr>
<p>📌 기본</p>
<ul>
<li>파란공(10개) 빨간공(6개)<pre><code>-(10/16)*np.log2(10/16) - 6/16*np.log2(6/16)</code></pre>0.954434002924965</li>
</ul>
<p>📌 정 중앙에 선을 하나 만들어서 나눔</p>
<ul>
<li><p>(왼쪽) 파란공(1) 빨간공(7), (오른쪽) 파란공(5) 빨간공(3)</p>
<pre><code>0.5*(-(7/8)*np.log2(7/8) -1/8*np.log2(1/8)) + \
0.5*(-(3/8)*np.log2(3/8) - 5/8*np.log2(5/8))</code></pre><p>0.7489992230622807
✅ 
엔트로피가 내려갔으므로, 분할 하는 것이 좋음!!!!</p>
</li>
</ul>
<p></br></br></p>
<h3 id="3-지니계수">(3) 지니계수</h3>
<hr>
<ul>
<li>Gini index 혹은 불순도율</li>
<li>엔트로피의 계산량이 많아서 비슷한 개념이면서 보다 계산량이 적은 지니계수를 사용하는 경우가 많다.</li>
</ul>
<p></br></br></p>
<h3 id="3-1-예제">(3)-1 예제</h3>
<hr>
<p>📌 기본</p>
<ul>
<li>파란공(10개) 빨간공(6개)<pre><code># 1 - 파란색의 확률 - 빨간색의 확률
1 - (6/16)**2 - (10/16)**2</code></pre>0.46875</li>
</ul>
<p>📌 정 중앙에 선을 하나 만들어서 나눔</p>
<ul>
<li><p>(왼쪽) 파란공(1) 빨간공(7), (오른쪽) 파란공(5) 빨간공(3)</p>
<pre><code>0.5*(1 - (7/8)**2 - (1/8)**2) + 0.5*(1 - (3/8)**2 - (5/8)**2)</code></pre><p>0.34375
✅ 
지니계수 값이 내려갔으므로, 분할 하는 것이 좋음!!!!</p>
</li>
</ul>
<p></br></br>
</br></br></p>
<h1 id="2-scikit-learn">2. Scikit Learn</h1>
<hr>
<p>📌 모듈</p>
<pre><code>from sklearn.tree import DecisionTreeClassifier

iris_tree = DecisionTreeClassifier()</code></pre>  </br>

<p>📌 data 확인</p>
<ul>
<li><p>150개의 행, 4개의 열</p>
<pre><code>iris.data.shape</code></pre><p>(150, 4)</p>
</br>
</li>
<li><p>첫번째 행 불러오기. 리스트 값으로 반환됨</p>
<pre><code>iris.data[0]</code></pre><p>array([5.1, 3.5, 1.4, 0.2])</p>
</br>
</li>
<li><p>컬럼 확인</p>
<pre><code>iris_pd.head(1)</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/84a9dd9e-2453-4e0c-a1fb-169d7524a851/image.png" alt=""></p>
</br>
</li>
<li><p>[모든행, petal length (cm)    petal width (cm) 만 따오고 싶음]</p>
<pre><code>iris.data[:, 2:]</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/73cf55e9-73c9-4786-92d3-9068064cdcf5/image.png" alt=""></p>
</br>

</li>
</ul>
<h3 id="1-학습">(1) 학습</h3>
<hr>
<p>📌학습 시킬 모델명 : iris_tree
📌fit 명령을 써서 정답과 함께 학습을 완료 시키고 싶음</p>
<ul>
<li>fit : 학습해라 (데이터, 정답)<pre><code>iris_tree.fit(iris.data[:, 2:], iris.target)</code></pre><img src="https://velog.velcdn.com/images/jaam_mini/post/e52fc7bc-7ac2-4d6f-9f0f-23d526dfa067/image.png" alt=""></li>
</ul>
</br>

<h3 id="2-성능-확인">(2) 성능 확인</h3>
<hr>
<p>📌 Accuracy 확인</p>
<ul>
<li>y_pred_tr : 예측 결과 변수</li>
<li>iris.target : 참 값 (정답)</li>
</ul>
<p>📌 99.3 % 의 정확성 도출</p>
<ul>
<li><p>accuracy_score 모듈</p>
<pre><code>from sklearn.metrics import accuracy_score</code></pre></li>
<li><p>학습이 완료된 iris_tree에게 예측(predict)을 시킴 (원하는 데이터(값))</p>
<pre><code>y_pred_tr = iris_tree.predict(iris.data[:, 2:])</code></pre></li>
<li><p>accuracy_score 함수 사용</p>
</li>
<li><p>(정답 알려주고, 예측한 결과도 알려줌)</p>
<pre><code>accuracy_score(iris.target, y_pred_tr)</code></pre></li>
</ul>
<p>0.9933333333333333</p>
<p></br></br></br></br></p>
<h1 id="3-과적합">3. 과적합</h1>
<hr>
<h3 id="1-지도학습">(1) 지도학습</h3>
<p><img src="https://velog.velcdn.com/images/jaam_mini/post/39b51f20-ab30-4c3f-b750-b514e634096e/image.png" alt=""></p>
<ul>
<li>Label(Y, 정답)을 붙여 학습 시킴</li>
<li>새로운 데이터를 학습시킨 것에 넣음</li>
<li>예측 결과를 뽑아줌</li>
</ul>
<p></br></br></p>
<h3 id="2-plot-tree">(2) plot tree</h3>
<hr>
<ul>
<li><p>plot_tree 모듈</p>
<pre><code>from sklearn.tree import plot_tree
</code></pre></li>
<li><p>iris_tree 가 어떻게 생겼는지 보여줘~</p>
<pre><code>plt.figure(figsize=(10,7))
plot_tree(iris_tree);</code></pre><p><img src="https://velog.velcdn.com/images/jaam_mini/post/8c9b8926-f701-43b1-8434-2bb888200a5d/image.png" alt=""></p>
</li>
</ul>
<p></br></br></p>
<h3 id="3-mlxtendplotting">(3) mlxtend.plotting</h3>
<hr>
<ul>
<li>데이터의 경계선을 그려주는 함수</li>
<li>쓸수 있는 상황이 많진 않음</li>
</ul>
<p>📌 mlxtend 설치</p>
<pre><code># !pip install mlxtend</code></pre></br>

<p>📌 plot_tree 데이터 확인
iris의 품종을 분류하는 결정나무 모델이 어떻게 데이터를 분류했는지 확인해보자</p>
<ul>
<li><p>mlxtend 모듈</p>
<pre><code>from mlxtend.plotting import plot_decision_regions</code></pre></li>
<li><p>X 는 대문자로 써야함....?</p>
</li>
<li><p>clf모델 : iris_tree에 학습되어 저장되어 있음</p>
</li>
<li><p>legend : 범례</p>
<pre><code>plt.figure(figsize=(14,8))
plot_decision_regions(X=iris.data[:, 2:], y=iris.target, clf=iris_tree, legend=2)
plt.show()</code></pre></li>
<li><p><img src="https://velog.velcdn.com/images/jaam_mini/post/e0616a6b-9a52-4c10-a4dd-12a9d79cd456/image.png" alt=""></p>
<ul>
<li>저 경계면은 올바른 걸까?</li>
<li>저 결과는 내가 가진 데이터를 벗어나서 일반화할 수 있는 걸까?</li>
<li>어차피 얻은(혹은 구한) 데이터는 유한하고 내가 얻은 데이터를 이용해서 일반화를 추구하게 된다.</li>
<li>이때 복잡한 경계면은 모델의 성능을 결국 나쁘게 만든다.</li>
</ul>
</li>
</ul>
<p></br></br></br></br></p>
<h1 id="4-데이터-분리">4. 데이터 분리</h1>
<h2 id="1-데이터-훈련테스트로-분리">1. 데이터 훈련/테스트로 분리</h2>
<pre><code>from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()</code></pre><pre><code># train_test_split 함수(나눠주는) 모듈
from sklearn.model_selection import train_test_split

# features : iris.data[:, 2:] 변수
# labels : iris.target (정답) 변수
features = iris.data[:, 2:]
labels = iris.target

# 4개의 변수를 반환 받을 것임 : train_test_split 라는 변수로
# 지정해야 할 것들 : features, labels, test_size (훈련용80%, 랜덤용20%), random_state
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=13
)</code></pre><pre><code>X_train.shape, X_test.shape</code></pre><p>((120, 2), (30, 2))</p>
<p></br></br></p>
<h2 id="2-npunique-검사">2. np.unique 검사</h2>
<p>📌 (필수..! 잘 분리되었는지 확인)</p>
<ul>
<li>((120, 2), (30, 2))에 iris 3개 종이 각각 몇개 들어갔을까?</li>
<li>꼭 확인하는 작업이 필요함</li>
</ul>
<pre><code># 모듈
import numpy as np

np.unique(y_test, return_counts=True)</code></pre><p>(array([0, 1, 2]), array([ 9,  8, 13], dtype=int64))</p>
<p></br></br></p>
<h2 id="3-stratify">3. stratify</h2>
<ul>
<li>문제가 각 클래스(setosa, versicolor, verginica) 별로 동일 비율이 아니다</li>
<li>이럴때, class의 옵션을 맞춰주는 것이 좋음</li>
<li>내가 맞춰야 될 특성이 있는 라벨로 넣어줘야 함</li>
</ul>
<pre><code>from sklearn.model_selection import train_test_split

features = iris.data[:, 2:]
labels = iris.target

# 📌 stratify=labels 추가 : class 별 분포를 맞춰 줌
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=13, stratify=labels
)</code></pre><pre><code>import numpy as np

np.unique(y_test, return_counts=True)</code></pre><p>(array([0, 1, 2]), array([10, 10, 10], dtype=int64))</p>
<p></br></br></p>
<h2 id="4-결정나무모델decisiontreeclassifier">4. 결정나무모델(DecisionTreeClassifier)</h2>
<h3 id="1-max_depth--과적합을-위해-제한해야-함">(1) max_depth : 과적합을 위해 제한해야 함</h3>
<ul>
<li>깊을 수 록 내가 준 데이터의 성능이 100%에 다가감</li>
<li>성능이 높은 것이 꼭 좋지 않음</li>
<li>제한 시킬 필요가 있음</li>
</ul>
<pre><code># DecisionTreeClassifier 모듈
from sklearn.tree import DecisionTreeClassifier

# max_depth 설정
iris_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

# fit 훈련 : (데이터, 정답)
iris_tree.fit(X_train, y_train)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/b8dcdaa7-725b-4f16-a947-3026c88e6edf/image.png" alt=""></li>
</ul>
<p></br></br></p>
<h3 id="2-plot_tree">(2) plot_tree</h3>
<ul>
<li>iris_tree 가 어떻게 생겼는지 보여줘</li>
</ul>
<pre><code># plot_tree 모듈
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# iris_tree 가 어떻게 생겼는지 보여줘~
plt.figure(figsize=(5,5))
plot_tree(iris_tree);</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/9c708d37-ac5c-4009-bfd0-cfaf0de8767a/image.png" alt=""></li>
</ul>
<p></br></br></p>
<h3 id="3-accuracy-확인">(3) Accuracy 확인</h3>
<ul>
<li>성능 확인</li>
</ul>
<pre><code># accuracy_score 모듈
from sklearn.metrics import accuracy_score

# 학습이 완료된 iris_tree에게 예측(predict)을 시킴 (훈련된값)
y_pred_tr = iris_tree.predict(X_train)

# accuracy_score 함수 사용
# (정답 알려주고, 예측한 결과도 알려줌)
accuracy_score(y_train, y_pred_tr)</code></pre><p>0.95</p>
<p></br></br></p>
<h2 id="5-x_train의-결정경계-확인">5. X_train의 결정경계 확인</h2>
<pre><code># plot_decision_regions 모듈
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

plt.figure(figsize=(12,5))
plot_decision_regions(X=X_train, y=y_train, clf=iris_tree, legend=2)
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/012da1ec-7b1a-470f-be36-7499f2a1ae86/image.png" alt=""></li>
</ul>
<p><br/><br/></p>
<h2 id="6-test-data위-쿼리s에-accuracy정확도-확인">6. test data(위 쿼리s)에 accuracy(정확도 확인)</h2>
<ul>
<li>test 결과 96.6% 도출</li>
</ul>
<pre><code>y_pred_test = iris_tree.predict(X_test)
accuracy_score(y_test, y_pred_test)</code></pre><p>0.9666666666666667</p>
<p><br/><br/></p>
<h2 id="7-잔기술">7. 잔기술</h2>
<ul>
<li>150개 데이터 전체를 train과 test로 분리해 결정경계까지 넣고자 함</li>
</ul>
<pre><code>scatter_highlight_kwargs = {&#39;s&#39;:150, &#39;label&#39;:&#39;Test data&#39;, &#39;alpha&#39;:0.9}
scatter_kwargs = {&#39;s&#39;:120, &#39;edgecolor&#39;:None, &#39;alpha&#39;:0.7}

plt.figure(figsize=(12,8))
plot_decision_regions(X=features, y=labels,
                      X_highlight=X_test,
                      clf=iris_tree,
                      legend=2,
                      scatter_highlight_kwargs=scatter_highlight_kwargs,
                      scatter_kwargs=scatter_kwargs,
                      contourf_kwargs={&#39;alpha&#39;:0.2})
plt.show()</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/526f67c2-0c85-4f96-81bb-94dbd46c09be/image.png" alt=""></li>
</ul>
<p><br/><br/></p>
<h2 id="8-모델-사용-방법">8. 모델 사용 방법</h2>
<ul>
<li>새로운 데이터를 가지고 예측 결과를 도출</li>
</ul>
<p><br/><br/></p>
<h3 id="1-새-데이터로-결과-도출해보기">(1) 새 데이터로 결과 도출해보기</h3>
<ul>
<li><p>features 4개를 새로 지정</p>
<pre><code>features = iris.data
labels = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=13
)

iris_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
iris_tree.fit(X_train, y_train)</code></pre><ul>
<li><img src="https://velog.velcdn.com/images/jaam_mini/post/fdac2a15-9b66-4429-82e2-8158e573bda7/image.png" alt=""></li>
</ul>
</li>
<li><p>주운 꽃의 데이터 : 4.3,2.,1.2,1.0</p>
<pre><code># 값으로 정답 도출
test_data = [[4.3,2.,1.2,1.0]]
iris_tree.predict(test_data)</code></pre><p>array([1])</p>
<pre><code># 문자로 정답 도출
iris.target_names[iris_tree.predict(test_data)]</code></pre><p>array([&#39;versicolor&#39;], dtype=&#39;&lt;U10&#39;)</p>
<pre><code># predict_proba : 각 데이터일 확률 확인
iris_tree.predict_proba(test_data)</code></pre><p>array([[0.        , 0.97222222, 0.02777778]])</p>
<pre><code># list 형태로, shape을 보고 싶으면 np.array()로 감싸줄 것

test_data = np.array([[4.3,2.,1.2,1.0]])
test_data.shape</code></pre><p>(1, 4)</p>
</li>
</ul>
<p><br/><br/></p>
<h3 id="2-zip">(2) zip</h3>
<ul>
<li><p>zip 모델</p>
<pre><code>iris_clf_model = dict(zip(iris.feature_names, iris_tree.feature_importances_))
iris_clf_model</code></pre><p>{&#39;sepal length (cm)&#39;: 0.0,
 &#39;sepal width (cm)&#39;: 0.0,
 &#39;petal length (cm)&#39;: 0.421897810218978,
 &#39;petal width (cm)&#39;: 0.578102189781022}</p>
</li>
<li><p>리스트를 튜플로 만들기</p>
<pre><code>list1 = [&#39;a&#39;,&#39;b&#39;,&#39;c&#39;]
list2 = [1,2,3]</code></pre><pre><code>pairs = [pair for pair in zip(list1, list2)]
pairs</code></pre><p>[(&#39;a&#39;, 1), (&#39;b&#39;, 2), (&#39;c&#39;, 3)]</p>
</li>
<li><p>튜플을 dict 으로</p>
<pre><code>dict(pairs)</code></pre><p>{&#39;a&#39;: 1, &#39;b&#39;: 2, &#39;c&#39;: 3}</p>
</li>
</ul>
]]></description>
        </item>
    </channel>
</rss>