부스트캠프 AI Tech 4기

[NLP_KLUE] 2. 한국어 전처리

자연어처리의 단계

Task 설계
필요 데이터 수집
통계학적 분석
- Token 개수 -> 아웃라이어 제거
- 빈도 확인 -> 사전(dictionary) 정의
전처리
- 개행문자 제거
- 특수문자 제거
- 공백 제거
- 중복 표현 제거
- 이메일, 링크 제거
- 제목 제거
- 불용어 제거
- 조사 제거
- 띄어쓰기, 문장분리 보정
Tagging(라벨링)
Tokenizing - 자연어를 어떤 단위로 살펴볼 것인가
- 어절 tokenizing
- 형태소 tokenizing
- WordPiece tokenizing
모델 설계
모델 구현
성능 평가

한국어 전처리

▮ 전처리를 위한 코퍼스 수집

url 정보만 입력해주면 텍스트를 추출해주는 라이브러리 : newspaper

▮ <HTML> 태그 전처리

def remove_html(texts):
    """Strip HTML tags from each string and drop strings that end up empty.

    A tag that is followed by whitespace and then another tag is removed
    together with that whitespace; every other tag is removed on its own.
    The remaining text is trimmed before the emptiness check.

    Args:
        texts: iterable of strings that may contain HTML markup.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    tag_pattern = re.compile(r"<[^>]+>\s+(?=<)|<[^>]+>")
    without_tags = (tag_pattern.sub("", raw).strip() for raw in texts)
    return [cleaned for cleaned in without_tags if cleaned]

▮ 문장 분리

import kss

# Flat list of sentences gathered from all entries of ``context``.
sents = []

# NOTE(review): ``context`` is not defined in this snippet - presumably an
# iterable of raw text strings produced by an earlier step; confirm.
for sent in context:
    sent = sent.strip()
    # Skip entries that are empty after trimming surrounding whitespace.
    if sent:
        # Split the entry with kss and add every resulting sentence to ``sents``.
        splited_sent = kss.split_sentences(sent)
        sents.extend(splited_sent)

▮ Normalizing

def remove_email(texts):
    """Remove e-mail addresses from each string and drop strings left empty.

    The local part accepts ASCII letters, digits and ``_ . + -``. The hyphen
    is placed last inside the character class on purpose: written as
    ``+-_`` it forms a range from ``+`` to ``_`` that also matches
    characters such as ``:``, ``<``, ``=`` and ``?``, which would delete
    punctuation adjacent to the address.

    Args:
        texts: iterable of strings.

    Returns:
        List of trimmed, non-empty strings with e-mail addresses removed.
    """
    email_pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")
    preprocessed_text = []
    for text in texts:
        text = email_pattern.sub("", text).strip()
        if text:
            preprocessed_text.append(text)
    return preprocessed_text

def remove_hashtag(texts):
    """Remove hashtags (``#`` plus the following non-whitespace run).

    Each string is trimmed after the removal; strings that become empty
    are left out of the result.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    hashtag_pattern = re.compile(r"#\S+")
    kept = []
    for raw in texts:
        cleaned = hashtag_pattern.sub("", raw).strip()
        if not cleaned:
            continue
        kept.append(cleaned)
    return kept

def remove_user_mention(texts):
    """Remove user mentions (``@`` followed by word characters).

    Each string is trimmed after the removal; strings that become empty
    are left out of the result.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    mention_pattern = re.compile(r"@\w+")
    cleaned_texts = [mention_pattern.sub("", raw).strip() for raw in texts]
    return list(filter(None, cleaned_texts))

def remove_url(texts):
    """Remove URLs from each string and drop strings left empty.

    Two passes are applied in order: first anything containing ``://``
    (with an optional http/https scheme) or starting with ``www.``, then
    ``pic.``-style links. The string is trimmed after each pass.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    url_passes = (
        re.compile(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*"),
        re.compile(r"pic\.(\w+\.)+\S*"),
    )
    kept = []
    for raw in texts:
        cleaned = raw
        for url_pattern in url_passes:
            cleaned = url_pattern.sub("", cleaned).strip()
        if cleaned:
            kept.append(cleaned)
    return kept

def remove_bad_char(texts):
    """Replace or delete characters that tend to cause trouble downstream.

    Zero-width space and BOM characters are deleted, the ellipsis character
    is expanded to `` ... ``, and the characters in the final regex class
    are deleted. The result is not trimmed; only empty strings are dropped.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    bad_chars = {"\u200b": "", "…": " ... ", "\ufeff": ""}
    leftover_pattern = re.compile(r"[\+á?\xc3\xa1]")
    kept = []
    for raw in texts:
        cleaned = raw
        for bad_char, substitute in bad_chars.items():
            cleaned = cleaned.replace(bad_char, substitute)
        cleaned = leftover_pattern.sub("", cleaned)
        if not cleaned:
            continue
        kept.append(cleaned)
    return kept

def remove_press(texts):
    """Remove press / reporter credits from news text.

    Examples:
        ``홍길동 기자 (연합뉴스)`` -> ````
        ``(이스탄불=연합뉴스) 하채림 특파원`` -> ````

    The outlet alternatives include both the correct spelling ``한겨레`` and
    the common misspelling ``한겨례``; with only the misspelling, credits
    that use the real newspaper name would never be matched.

    Patterns are applied in order and the string is trimmed after each one,
    so later patterns see the output of earlier ones.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    re_patterns = [
        # Parenthesised outlet credit, e.g. "(city=outlet)".
        r"\([^(]*?(뉴스|경제|일보|미디어|데일리|한겨레|한겨례|타임즈|위키트리)\)",
        # Reporter name followed by a job title and a space.
        r"[가-힣]{0,4} (기자|선임기자|수습기자|특파원|객원기자|논설고문|통신원|연구소장) ",
        # Bare outlet name preceded by Hangul characters.
        r"[가-힣]{1,}(뉴스|경제|일보|미디어|데일리|한겨레|한겨례|타임|위키트리)",
        r"\(\s+\)",  # leftover "(  )"
        r"\(=\s+\)",  # leftover "(=  )"
        r"\(\s+=\)",  # leftover "(  =)"
    ]

    preprocessed_text = []
    for text in texts:
        for re_pattern in re_patterns:
            text = re.sub(re_pattern, "", text).strip()
        if text:
            preprocessed_text.append(text)
    return preprocessed_text

def remove_copyright(texts):
    """Remove copyright notices embedded in news text.

    Example:
        ``(사진=저작권자(c) 연합뉴스, 무단 전재-재배포 금지)``
        -> ``(사진= 연합뉴스, 무단 전재-재배포 금지)``

    The first pattern removes a whole ``<저작권자... >`` span; the second
    removes stand-alone copyright markers. The string is trimmed after each
    pattern, and strings that become empty are dropped.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    copyright_patterns = tuple(
        re.compile(source)
        for source in (
            r"\<저작권자(\(c\)|ⓒ|©|\(Copyright\)|(\(c\))|(\(C\))).+?\>",
            r"저작권자\(c\)|ⓒ|©|(Copyright)|(\(c\))|(\(C\))",
        )
    )
    kept = []
    for raw in texts:
        cleaned = raw
        for notice in copyright_patterns:
            cleaned = notice.sub("", cleaned).strip()
        if cleaned:
            kept.append(cleaned)
    return kept

def remove_photo_info(texts):
    """Remove image/source labels embedded in news text.

    Examples:
        ``(사진= 연합뉴스, 무단 전재-재배포 금지)`` -> ````
        ``(출처=청주시)`` -> ````

    A label is a parenthesised group starting with ``출처=``, ``사진=`` or
    ``자료=``. Its content may hold one level of nested parentheses (for
    example ``(c)``) but never runs past the label's own closing
    parenthesis, so text between two separate parenthesised groups is
    preserved. The space after a label is optional, which lets a label at
    the very end of a string be removed as well.

    Args:
        texts: iterable of strings.

    Returns:
        List of trimmed, non-empty strings with the labels removed.
    """
    photo_pattern = re.compile(
        # "(출처=...)", "(사진=...)", "(자료=...)" with an optional trailing space.
        r"\((?:출처|사진|자료) ?= ?(?:[^()]|\([^()]*\))+\) ?"
        # Stand-alone "(자료사진)" surrounded by spaces.
        r"| \(자료사진\) "
        # Un-parenthesised photo credit; lazy so it stops at the first "기자 ".
        r"|사진=.+?기자 "
    )
    preprocessed_text = []
    for text in texts:
        text = photo_pattern.sub("", text).strip()
        if text:
            preprocessed_text.append(text)
    return preprocessed_text

def remove_useless_breacket(texts):
    """Clean up parenthesised groups that carry no or partial information.

    Inside each ``(...)`` group the content is split on commas and blank
    items are discarded. If nothing remains, the whole group is removed;
    otherwise it is rebuilt from the remaining items joined by ``", "``.

    Examples:
        ``수학(,)`` -> ``수학``
        ``수학(數學,)`` -> ``수학(數學)``

    Groups are processed as non-overlapping matches, so text with nested
    parentheses is never duplicated in the output.

    Args:
        texts: iterable of strings.

    Returns:
        List of non-empty strings. Strings in which at least one group was
        found are trimmed; strings without any group are returned as-is.
    """
    bracket_pattern = re.compile(r"\((.*?)\)")

    def _rebuild(match):
        # Keep only the comma-separated items that are non-blank after trimming.
        kept = [info.strip() for info in match.group(1).split(",") if info.strip()]
        return "(" + ", ".join(kept) + ")" if kept else ""

    preprocessed_text = []
    for text in texts:
        text = text.replace("()", "")  # "수학()" -> "수학"
        modi_text, bracket_count = bracket_pattern.subn(_rebuild, text)
        if bracket_count:
            # Only strings that actually contained a group are trimmed;
            # bracket-free strings pass through untouched.
            modi_text = modi_text.strip()
        if modi_text:
            preprocessed_text.append(modi_text)
    return preprocessed_text

from soynlp.normalizer import *

def remove_repeat_char(texts):
    """Normalize runs of repeated characters via ``repeat_normalize``.

    Example:
        ``ㅋㅋㅋㅋㅋㅋㅋ`` -> ``ㅋㅋ``

    Each string is passed to ``repeat_normalize`` with ``num_repeats=2``,
    trimmed, and kept only if it is still non-empty.

    Args:
        texts: iterable of strings.

    Returns:
        List of normalized, non-empty strings in their original order.
    """
    normalized = (repeat_normalize(raw, num_repeats=2).strip() for raw in texts)
    return [cleaned for cleaned in normalized if cleaned]

def clean_punc(texts):
    """Map typographic and special symbols to plain equivalents.

    Every key of the mapping is replaced by its value, one key after
    another; the string is then trimmed and kept only if non-empty.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    punct_mapping = {
        "‘": "'",
        "₹": "e",
        "´": "'",
        "°": "",
        "€": "e",
        "™": "tm",
        "√": " sqrt ",
        "×": "x",
        "²": "2",
        "—": "-",
        "–": "-",
        "’": "'",
        "_": "-",
        "`": "'",
        '“': '"',
        '”': '"',
        "£": "e",
        '∞': 'infinity',
        'θ': 'theta',
        '÷': '/',
        'α': 'alpha',
        '•': '.',
        'à': 'a',
        '−': '-',
        'β': 'beta',
        '∅': '',
        '³': '3',
        'π': 'pi',
    }

    kept = []
    for raw in texts:
        cleaned = raw
        for symbol, plain in punct_mapping.items():
            cleaned = cleaned.replace(symbol, plain)
        cleaned = cleaned.strip()
        if not cleaned:
            continue
        kept.append(cleaned)
    return kept

def remove_repeated_spacing(texts):
    """Collapse every run of whitespace into a single space.

    Example:
        ``오늘은    날씨가   좋다.`` -> ``오늘은 날씨가 좋다.``

    Each string is trimmed afterwards; strings that become empty are
    dropped.

    Args:
        texts: iterable of strings.

    Returns:
        List of cleaned, non-empty strings in their original order.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = [whitespace_run.sub(" ", raw).strip() for raw in texts]
    return [cleaned for cleaned in collapsed if cleaned]

from collections import OrderedDict
def remove_dup_sent(texts):
    """Remove duplicate sentences, keeping the first occurrence of each.

    Args:
        texts: iterable of hashable items (sentences).

    Returns:
        List of unique items in first-seen order.
    """
    seen = set()
    unique = []
    for sentence in texts:
        if sentence in seen:
            continue
        seen.add(sentence)
        unique.append(sentence)
    return unique

# !pip install git+https://github.com/haven-jeon/PyKoSpacing.git

from pykospacing import Spacing
# Module-level Spacing instance, created once at import time and called as
# ``spacing(text)`` by the spacing-correction step.
spacing = Spacing()

def spacing_sent(texts):
    """Correct word spacing by passing each string through ``spacing``.

    Strings for which ``spacing`` returns an empty result are dropped.

    Args:
        texts: iterable of strings.

    Returns:
        List of spacing-corrected, non-empty strings in their original order.
    """
    corrected = (spacing(raw) for raw in texts)
    return [fixed for fixed in corrected if fixed]

# !pip install git+https://github.com/ssut/py-hanspell.git

from hanspell import spell_checker

def spell_check_sent(texts):
    """Correct spelling with ``spell_checker.check`` on a best-effort basis.

    For each string the ``checked`` attribute of the checker result is
    used. If the checker raises, the original string is kept unchanged so
    that one failure does not lose the sentence. Only ``Exception``
    subclasses are caught, so ``KeyboardInterrupt`` and ``SystemExit``
    still propagate and a long run can be interrupted.

    Args:
        texts: iterable of strings.

    Returns:
        List of corrected strings (or the originals where checking failed);
        empty corrected results are dropped.
    """
    preprocessed_text = []
    for text in texts:
        try:
            checked_sent = spell_checker.check(text).checked
        except Exception:
            # Deliberate fallback: keep the uncorrected sentence.
            preprocessed_text.append(text)
            continue
        if checked_sent:
            preprocessed_text.append(checked_sent)
    return preprocessed_text

형태소 분석 기반 필터링

# !pip install konlpy
# !bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

from konlpy.tag import Mecab
# Module-level Mecab tagger instance, created once at import time; the
# morpheme-based filter calls ``mecab.pos(...)`` on it.
mecab = Mecab()

def morph_filter(texts):
    """Keep only sentences that contain all three kinds of POS tags.

    A sentence is kept once its morphemes have shown at least one tag from
    each of the three tag groups below (the NN*/NP group, the V*/XS* group,
    and the J*/ETM group). Compound tags joined with ``+`` are split and
    each part is checked separately.

    Args:
        texts: iterable of strings.

    Returns:
        List of the sentences that satisfied all three groups, in their
        original order.
    """
    noun_group = frozenset(["NNG", "NNP", "NNB", "NP"])
    verb_group = frozenset(["VV", "VA", "VX", "VCP", "VCN", "XSN", "XSA", "XSV"])
    particle_group = frozenset(
        ["JKS", "J", "JO", "JK", "JKC", "JKG", "JKB", "JKV", "JKQ", "JX", "JC", "JKI", "JKO", "JKM", "ETM"]
    )

    kept = []
    for sentence in texts:
        has_noun = has_verb = has_particle = False
        for morph in mecab.pos(sentence, join=False):
            tags = morph[1].split("+")
            has_noun = has_noun or any(tag in noun_group for tag in tags)
            has_verb = has_verb or any(tag in verb_group for tag in tags)
            has_particle = has_particle or any(tag in particle_group for tag in tags)
            if has_noun and has_verb and has_particle:
                # All three groups seen; no need to look at further morphemes.
                kept.append(sentence)
                break
    return kept

def excluded_word_filter(excluded_words, texts):
    """Drop every sentence that contains any of the excluded words.

    Matching is a plain substring test.

    Args:
        excluded_words: collection of substrings that disqualify a sentence.
        texts: iterable of strings.

    Returns:
        List of sentences containing none of the excluded words, in their
        original order.
    """
    return [
        sentence
        for sentence in texts
        if not any(banned in sentence for banned in excluded_words)
    ]

def remove_stopwords(sents, stopwords=None):
    """Remove stopword tokens from each sentence.

    Sentences are split on single spaces, tokens equal to a stopword are
    discarded, and the rest are joined back with single spaces. Unlike the
    other cleaning steps, a sentence that ends up empty is still returned
    (as ``""``), so the output has one entry per input sentence.

    Args:
        sents: iterable of strings.
        stopwords: optional iterable of tokens to remove. When omitted, the
            built-in default list below is used.

    Returns:
        List of sentences with stopword tokens removed, same length as
        ``sents``.
    """
    if stopwords is None:
        # Default tokens considered to carry little meaning.
        stopwords = ['소취요', '-', '조드윅', '포스터', '앓는', '서린']
    stopword_set = set(stopwords)  # constant-time membership per token
    return [
        ' '.join(word for word in sent.split(' ') if word not in stopword_set)
        for sent in sents
    ]

def min_max_filter(min_len, max_len, texts):
    """Keep sentences whose length lies strictly between the two bounds.

    Args:
        min_len: exclusive lower bound on ``len(text)``.
        max_len: exclusive upper bound on ``len(text)``.
        texts: iterable of strings.

    Returns:
        List of sentences with ``min_len < len(text) < max_len``, in their
        original order.
    """
    return [sentence for sentence in texts if min_len < len(sentence) < max_len]

부스트캠프 AI Tech 교육 자료를 참고하였습니다.

728x90

'부스트캠프 AI Tech 4기' 카테고리의 다른 글

[NLP_KLUE] 4. BERT / Huggingface Tokenizer (2)	2022.11.15
[NLP_KLUE] 3. 한국어 Tokenizing (1)	2022.11.14
[NLP_KLUE] 1. 자연어 단어 임베딩 / 딥러닝 기반 자연어처리와 언어모델 (0)	2022.11.14
[WEEK08] CI (0)	2022.11.14
[WEEK08] 회고 (0)	2022.11.14

Contents

새소식

[NLP_KLUE] 2. 한국어 전처리

자연어처리의 단계

한국어 전처리

'부스트캠프 AI Tech 4기' 카테고리의 다른 글

당신이 좋아할만한 콘텐츠

티스토리툴바