
Filling in example sentences for a word list by finding sentences in the NLTK corpora that contain each word

This is Python code for Colab that finds sentences containing the words in a vocabulary list and uses them as example sentences.

Mount drive in Colab
# Mount my Google Drive (storage)
from google.colab import drive
drive.mount('/content/gdrive')

# data dir
import os
data_dir = '/content/gdrive/MyDrive/YOUR FOLDER NAME'  # Your data directory in Colab 
os.listdir(data_dir)
Input Word list
import pandas as pd
import numpy as np

voca = pd.read_excel(os.path.join(data_dir, "input_words.xlsx"))
voca.head()
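If input_words.xlsx isn't at hand, a minimal stand-in DataFrame with the same columns used later in the post ("No", "Word", "Meaning") is enough to try the rest of the code; the words and meanings below are placeholders, not part of the original sheet.

# Hypothetical stand-in for input_words.xlsx (placeholder entries)
voca = pd.DataFrame({
    "No": [1, 2],
    "Word": ["liberty", "harvest"],
    "Meaning": ["freedom", "the gathering of ripened crops"],
})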
Prepare Corpora
# Corpora in NLTK
import nltk
from nltk import sent_tokenize, word_tokenize

nltk.download('inaugural')
nltk.download('punkt')
nltk.download('gutenberg')
nltk.download('brown')
nltk.download('genesis')

from nltk.corpus import inaugural
from nltk.corpus import gutenberg
from nltk.corpus import brown
from nltk.corpus import genesis
# Combine all corpora into one list of tokenized sentences
corpus = list()
corpus.extend(inaugural.sents())
corpus.extend(gutenberg.sents())
corpus.extend(brown.sents())
corpus.extend(genesis.sents())

# Join each token list back into a plain-text sentence string
corpus_sent = [" ".join(list_of_words) for list_of_words in corpus]
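A quick sanity check of the pooled corpus can look like this; exact counts depend on the NLTK data versions you downloaded.

# Size of the pooled corpus and a sample sentence
print(len(corpus_sent))
print(corpus_sent[0])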

Find sentence indices
# "lines=" indicate the number of sentences to be extracted.

def sent_find(word, corpus, lines = 2):

    results = []
    for idx, s in enumerate(corpus):
        if s.find(word) != -1:  # If a substring doesn't exist inside the string, it returns -1.
            if lines <= 0 : 
              break
            results.append(idx)
            lines -= 1

    try: 
      return results    
    except ValueError:
      pass 
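Before applying it to the whole word list, the function can be tried on a single word; "liberty" here is just an illustrative choice.

# Indices of up to two sentences containing an example word
idxs = sent_find("liberty", corpus_sent, lines=2)
print(idxs)
print(corpus_sent[idxs[0]] if idxs else "no match found")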
Replace indices with sentences
def sent_extract(sent_nos, corpus):
    """Join the sentences at the given indices into one string, one sentence per line."""
    return '\n'.join(corpus[no] for no in sent_nos)
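Chaining the two helpers for one word gives the newline-separated example sentences directly; again the word is only a placeholder.

# Up to two example sentences for a single word, one per line
print(sent_extract(sent_find("liberty", corpus_sent, lines=2), corpus_sent))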
Run the function
voca["Sent_No"] = voca["Word"].apply(lambda x: sent_find(x, corpus_sent, 2)) # two sentences
voca["Sentences"] = voca["Sent_No"].apply(lambda x: sent_extract(x, corpus_sent))

voca_book = voca[["No", "Word", "Meaning", "Sentences"]]
voca_book.to_excel(os.path.join(data_dir, 'output_sent_nltkCorpus.xlsx'))
voca.head()
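Since sent_find() returns an empty list when a word never appears in the pooled corpora, its "Sentences" cell stays empty. An optional check like the sketch below (not part of the original workflow) lists those words so they can be handled separately.

# Words for which no example sentence was found
missing = voca_book[voca_book["Sentences"] == ""]
print(missing["Word"].tolist())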