Working with text in Python

Original Source: https://www.coursera.org/specializations/data-science-python

텍스트 관련 기본 내장 함수

text = " Ethics are built into the ideals of the UN here. \n지금 시각은 3 시 30분이다. "
text

' Ethics are built into the ideals of the UN here. \n지금 시각은 3 시 30분이다. '

1. String Operations

len(text) #문자열의 길이를 반환한다.

text_list = text.split(' ') # 문자열을 ' '로 나눈 단어들의 리스트를 반환한다.
text_list

['',
 'Ethics',
 'are',
 'built',
 'into',
 'the',
 'ideals',
 'of',
 'the',
 'UN',
 'here.',
 '\n지금',
 '시각은',
 '3',
 '시',
 '30분이다.',
 '']

' '.join(text_list) # 문자열들을 ' '을 사이에 끼고 연결한다.

' Ethics are built into the ideals of the UN here. \n지금 시각은 3 시 30분이다. '

text.lower() #change all characters to lower

' ethics are built into the ideals of the un here. \n지금 시각은 3 시 30분이다. '

text.upper() #모두 다 대문자로 바꾼다.

' ETHICS ARE BUILT INTO THE IDEALS OF THE UN HERE. \n지금 시각은 3 시 30분이다. '

text.splitlines() # \n을 기준으로 나눈 문장들의 리스트를 반환한다.

[' Ethics are built into the ideals of the UN here. ', '지금 시각은 3 시 30분이다. ']

text.strip() # 문장 맨 앞과 맨 뒤에 있는 빈 공간을 모두 없앤다.

'Ethics are built into the ideals of the UN here. \n지금 시각은 3 시 30분이다.'

text.rstrip() # 문장 맨 뒤에 있는 빈 공간을 없앤다.

' Ethics are built into the ideals of the UN here. \n지금 시각은 3 시 30분이다.'

text.find('the') # 문자열에 있는 'the' 중 가장 앞에 있는 것의 인덱스를 반환한다.

text.rfind('the') #문자열에 있는 'the' 중 가장 뒤에 있는 것의 인덱스를 반환한다.

text.replace(' ', '빈칸') #문자열에 있는 모든 ' '를 '빈칸'으로 바꾼 문자열을 반환한다.

'빈칸Ethics빈칸are빈칸built빈칸into빈칸the빈칸ideals빈칸of빈칸the빈칸UN빈칸here.빈칸\n지금빈칸시각은빈칸3빈칸시빈칸30분이다.빈칸'

2. True / False를 반환하는 내장 함수

[w for w in text_list if w.startswith('i')] #'i'로 시작하는 단어들의 리스트를 반환한다.

['into', 'ideals']

[w for w in text_list if w.endswith('t')] #'t'로 끝나는 단어들의 리스트를 반환한다.

['built']

[w for w in text_list if w.istitle()] #맨 앞 글자만 대문자인 단어들의 리스트를 반환한다.

['Ethics']

[w for w in text_list if w.isupper()] #대문자로 이루어진 단어들의 리스트를 반환한다.

['UN']

[w for w in text_list if w.islower()] #소문자로 이루어진 단어들의 리스트를 반환한다.

['are', 'built', 'into', 'the', 'ideals', 'of', 'the', 'here.']

[w for w in text_list if w.isalpha()] #숫자가 아닌 글자로 이루어진 단어들의 리스트를 반환한다.

['Ethics',
 'are',
 'built',
 'into',
 'the',
 'ideals',
 'of',
 'the',
 'UN',
 '시각은',
 '시']

[w for w in text_list if w.isdigit()] #숫자로 이루어진 단어들의 리스트를 반환한다.

['3']

[w for w in text_list if w.isalnum()] #특수문자를 제외한 숫자와 글자로 이루어진 단어들의 리스트를 반환한다.

['Ethics',
 'are',
 'built',
 'into',
 'the',
 'ideals',
 'of',
 'the',
 'UN',
 '시각은',
 '3',
 '시']

3. 텍스트 파일 읽기

f = open('1.txt', 'r') # 파일을 열어 작업할 준비를 한다.
f.readline() # 파일의 첫 번째 줄을 읽는다. 현재 커서는 두 번째 줄 맨 앞에 있다.

'안녕하세요.\n'

f.seek(0) # 커서를 첫 번째 줄 맨 앞에 놓는다.
f.read() # 파일의 모든 줄을 읽는다.

'안녕하세요.\n123456\n?!@#가나다'

f.seek(0)
f.read(3) # 3번째 글자까지 읽는다.

'안녕하'

f.close() # 파일을 닫는다.

pandas로 텍스트 처리하기

import pandas as pd

time_sentences = ["Monday: The doctor's appointment is at 2:45pm.",
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

df = pd.DataFrame(time_sentences, columns=['text'])
df

	text
0	Monday: The doctor's appointment is at 2:45pm.
1	Tuesday: The dentist's appointment is at 11:30...
2	Wednesday: At 7:00pm, there is a basketball game!
3	Thursday: Be back home by 11:15 pm at the latest.
4	Friday: Take the train at 08:10 am, arrive at ...

# df['text']의 각 문자열의 길이를 담은 시리즈를 반환한다.
# Series.str.function()은 시리즈의 각 문자열에 function()을 적용한 결과를 반환한다.
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

# df['text']의 각 문자열의 단어 개수를 담은 시리즈를 반환한다.
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

# 특정 단어를 포함하고 있는지의 여부를 반환한다.
df['text'].str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

# 각 문자열의 숫자 개수를 반환한다.
# r은 'raw string'(원시 문자열) 접두사로, 백슬래시를 이스케이프 문자로 해석하지 않고 그대로 전달하게 한다. '\d'는 숫자를 의미하는 정규표현식이다.
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

# 각 문자열에서 정규표현식을 만족하는 부분 중 정규표현식에서 괄호 안에 있는 부분을 반환한다.
# 여기서는 시각과 분을 반환한다.
df['text'].str.findall(r'(\d?\d):(\d\d)')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

# 각 문자열에서 요일을 3 글자 축약형으로 바꾼다.
# (pandas 2.0부터는 regex의 기본값이 False이므로, 최신 버전에서는 regex=True 인자를 추가해야 한다.)
df['text'].str.replace(r'(\w+day\b)', lambda x: x.group()[:3])

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

# ?P<time>은 해당 부분의 열 이름을 'time'으로 하겠다는 것이다.
# 정규표현식을 만족하는 부분 중 정규표현식에서 괄호 안에 있는 부분을 반환한다.
# 한 문장에 두 부분이 정규표현식을 만족시킬 수 있으므로, match라는 두 번째 index가 만들어진다.
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))')

		time	hour	minute	period
	match
0	0	2:45pm	2	45	pm
1	0	11:30 am	11	30	am
2	0	7:00pm	7	00	pm
3	0	11:15 pm	11	15	pm
4	0	08:10 am	08	10	am
4	1	09:00am	09	00	am

Share on

Twitter Facebook Google+ LinkedIn

YoonSoo

Working with text in Python

텍스트 관련 기본 내장 함수

1. String Operations

2. True / False를 반환하는 내장 함수

3. 텍스트 파일 읽기

pandas로 텍스트 처리하기

Share on

Leave a Comment

You May Also Enjoy

Generalized Linear Models (GLM)

“ALBERT: A Lite BERT for Self-supervised Learning of Language Representations” Summarized

“Generative Pretraining from Pixels” Summarized

“Language Models are Few-Shot Learners” Summarized