《Python自然语言处理》第三章练习题答案
·
第三章
这本书网上能找到的答案很少。
这一章主要涉及字符串处理、正则表达式、爬虫、列表推导等内容。
1
s = 'colorful'
# The "u" belongs after "colo", i.e. at index 4; slicing at index 3 produced
# "coluorful" instead of "colourful".
british = s[:4] + 'u' + s[4:]
print(british)
2
# s has 8 characters, so -9 points one position before the start of the
# string and this lookup raises IndexError.
s[-9]
4
# A step of 2 keeps every second character starting at index 0: 'clru'.
s[::2]
5
# A step of -1 walks the string backwards, giving the reversed copy 'lufroloc'.
s[::-1]
7
import re
# Determiners: the word boundaries stop "a" from matching inside other words
# and let "an" be matched as a whole word instead of being cut short by the
# earlier "a" alternative.
determiner = re.compile(r'\b(?:a|an|the)\b')
string = "The purpose of this research was to create a framework of indicators that enabled us to measure the classic dimensions of sustainable development (SD): People, Planet, and Profit, in combination with the sustainability of the heritage values and the policy dimension. Methods developed as an approach to sustainable urban planning and that were based on system analysis"
print(determiner.findall(string))

# Arithmetic expressions: every operand after an operator may have several
# digits, and the group is non-capturing because only the whole match matters.
arithmetic = re.compile(r'\d+(?:[+*]\d+)*')
string = "The purpose of this research was to create 2+3*7"
print(arithmetic.search(string))
8
from urllib import request
# Download the article, report the HTTP status and headers, dump the body,
# then remove the markup and the surrounding whitespace.
with request.urlopen('https://news.sina.com.cn/gov/xlxw/2021-03-18/doc-ikknscsi7715675.shtml') as response:
    data = response.read()
    print('Status:', response.status, response.reason)
    for header_name, header_value in response.getheaders():
        print('%s: %s' % (header_name, header_value))
    page_text = data.decode('utf-8')
    print('Data:', page_text)

# Drop anything that looks like a tag.
without_html = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />', '', page_text)
# Trim whitespace at the very beginning and the very end of the text.
without_space = re.sub(r'^\s*|\s*$', '', without_html)
9
def load(f):
    """Read the text file at path *f* and return its punctuation marks.

    The file is decoded as UTF-8 first; if that fails it is read again as GBK.

    Args:
        f: Path of the text file to read.

    Returns:
        A list holding every character matched by the punctuation pattern
        below, in order of appearance.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is neither valid UTF-8 nor GBK.
    """
    try:
        # The handle gets its own name: binding it to ``f`` would overwrite
        # the path and leave the fallback branch nothing to reopen.
        with open(f, encoding='utf8') as handle:
            data = handle.read()
    except UnicodeDecodeError:
        with open(f, encoding='gbk') as handle:
            data = handle.read()
    return re.findall(r'[\。\,\;\“\’\?\(\)\:\-\——\`]', data)
# Tokenizer for abbreviations, capitalised word pairs, dates and amounts.
# The groups are non-capturing so that findall() returns the matched text
# itself rather than tuples of sub-groups, and the date alternative comes
# before the number alternative, which would otherwise consume "2011" and
# leave the date pattern unreachable.
pattern = re.compile(r"""
    (?:[A-Z]\.)+                  # abbreviations such as U.S.
    | [A-Z][a-z]*\s[A-Z][a-z]*    # two capitalised words
    | \d+-\d+-\d+                 # dates such as 2011-2-3
    | \$?\d+(?:\.\d+)?%?          # numbers, currency amounts, percentages
""", re.X)
text='That U.S.A 2011-2-3 soub $12.22'
res=pattern.findall(text)
print(res)
10
# Pair every word of the sentence with its length.
sent = ['ans', 'sjs', 'aaa']
res = list(zip(sent, map(len, sent)))
res
11
# Split on the letter 's' instead of on whitespace.
raw = 'aksjd askjdn eurs snnd'
raw.split('s')  # -> ['ak', 'jd a', 'kjdn eur', ' ', 'nnd']
12
# Print each space-separated chunk of ``raw`` on a line of its own.
print('\n'.join(raw.split(' ')))
13
# split() with no argument treats any run of whitespace (spaces, tabs, ...)
# as a single separator, whereas split(' ') splits on each single space
# character only.
raw='skjkaj sass \t'
raw.split()  # -> ['skjkaj', 'sass']
raw.split(' ')  # -> ['skjkaj', 'sass', '\t']
14
import jieba
'''
sort() 是列表(list)自带的方法,对列表进行原地排序并返回 None,使用时:比如列表类型的变量 ls, ls.sort()
sorted() 是内置函数,可作用于任意可迭代对象,返回一个新的已排序列表而不修改原对象,比 sort() 使用范围更广,使用时:sorted(ls)
'''
sent='people, Xi extended sincere greetings and best wishes to Bangladeshi President Abdul Hamid, Bangladeshi Prime Minister Sheikh Hasina, and the Bangladeshi government and people. (Xinhua/Li Xueren)'
words = sent.split()
# sorted() builds and returns a new list and leaves ``words`` untouched, so
# it has to run before the in-place sort for the difference to be visible.
sorted_words = sorted(words)
still_original = words == sent.split()
# list.sort() reorders ``words`` itself and returns None.
sort_result = words.sort()
15
"3"*7
3*7
16
# NOTE: ``%load`` is an IPython/Jupyter magic command, not Python syntax, so
# this line only works inside an IPython session.
# NOTE(review): presumably monty.py holds the assignment shown below - confirm.
%load monty.py
monty = 'Monty python'
17
string = 'ashshser alskdn eeee'
# ``%-6s`` left-aligns in a field of at least six characters. The width is
# only a minimum, so a longer string is printed unchanged. The previous
# ``'%a-6s'`` was parsed as ``%a`` followed by the literal text "-6s".
formatted = '%-6s' % string
print(formatted)
18
from nltk.corpus import brown
# Collect every Brown token that starts with "wh"/"Wh" followed by at least
# one more ASCII letter, then show the distinct values.
words = brown.words()
wh_prefix = re.compile(r'[wW]h[A-Za-z]+')
whs = [token for token in words if wh_prefix.match(token)]
set(whs)
20
import requests
import bs4
import re
url='http://news.weather.com.cn/2021/03/3449612.shtml'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
response.encoding='utf-8'
soup = bs4.BeautifulSoup(response.text,'html.parser')
# find_all() is the current name of the deprecated findAll() alias.
# Print the text of every <div class="articleBody"> element.
for i in soup.find_all(name='div', attrs={'class': 'articleBody'}):
    print(i.get_text())
21
from nltk.corpus import words
nltk_words = words.words()


def unknown(url):
    """Return the words on the page at *url* that are not in ``nltk_words``.

    The text of every ``<p>`` element is collected and split on whitespace.
    For each chunk the first run of ASCII letters is lower-cased and looked
    up in the word list.

    Args:
        url: Address of the page to download.

    Returns:
        A list of lower-cased words missing from ``nltk_words``, in page
        order; duplicates are kept.
    """
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # Paragraphs are joined with a space so the last word of one paragraph
    # does not fuse with the first word of the next one.
    cont = ' '.join(p.get_text() for p in soup.find_all(name='p'))
    # A set makes each membership test O(1) instead of a scan of the list.
    vocabulary = set(nltk_words)
    unknown_words = []
    # split() without an argument also separates words joined by newlines
    # or tabs, which split(' ') would leave in a single chunk.
    for chunk in cont.split():
        letter_runs = re.findall(r'[A-Za-z]+', chunk)
        if not letter_runs:
            continue
        w = letter_runs[0].lower()
        if w not in vocabulary:
            unknown_words.append(w)
    return unknown_words


unknown('http://www.chinadaily.com.cn/a/202103/19/WS6053fedba31024ad0bab0310.html')
23
# Rewrite "don't" as "do n't".
tokenized = re.sub(r"don't", "do n't", "don't touch me")
print(tokenized)
# The sample text is a raw string so that ``\w`` stays a literal backslash
# plus "w" instead of being an invalid escape sequence. The substitution
# still rewrites the literal "don't" inside it; the rest is left as it is.
bracketed = re.sub(r"don't", "do n't", r"《don't|\w+》")
print(bracketed)
24
def word_token(string):
    """Convert *string* to "hAck3r" spelling.

    Substitutions: ``ate`` -> ``8``, ``e`` -> ``3``, ``i`` -> ``1``,
    ``o`` -> ``0``, ``l`` -> ``|``, ``s`` -> ``5``, ``.`` -> ``5w33t!``.

    Args:
        string: Text to convert.

    Returns:
        The converted text.
    """
    # Order matters: "ate" has to be replaced before "e" turns into "3",
    # otherwise it can never match. "." goes last so the inserted "5w33t!"
    # is not touched by any other rule. The "|" rule targets the letter "l";
    # targeting the digit "1" would undo the i -> 1 substitution.
    substitutions = (
        ('ate', '8'),
        ('e', '3'),
        ('i', '1'),
        ('o', '0'),
        ('l', '|'),
        ('s', '5'),
        ('.', '5w33t!'),
    )
    w = string
    for old, new in substitutions:
        w = w.replace(old, new)
    return w


word_token('esjioa1s.ppate')
def word_token(string):
    """Map ``s`` to ``$`` at the start of *string* and to ``5`` elsewhere.

    Args:
        string: The word to convert; may be empty.

    Returns:
        The converted word. Input without an ``s`` is returned unchanged.
    """
    # startswith() is safe on the empty string, unlike indexing string[0].
    if string.startswith('s'):
        return '$' + string[1:].replace('s', '5')
    return string.replace('s', '5')


word_token('shs')
25
def word_transfer(string):
    """Convert the words of *string* to Pig Latin.

    A word is converted when its first character is neither a lowercase
    vowel nor an uppercase ASCII letter and it contains a lowercase vowel:
    the leading run of non-vowel characters moves to the end and ``ay`` is
    appended. Every other word, including words without any lowercase
    vowel, is kept unchanged.

    Args:
        string: Words separated by single spaces.

    Returns:
        The converted words joined by single spaces.
    """
    # Group 1: leading consonant cluster. Group 2: everything from the first
    # lowercase vowel onwards. Requiring group 2 means a vowel-less word
    # simply fails to match instead of raising AttributeError.
    cluster = re.compile(r'([^aeiouA-Z][^aeiou]*)([aeiou].*)')
    trans_words = []
    for word in string.split(' '):
        parts = cluster.match(word)
        if parts:
            trans_words.append(parts.group(2) + parts.group(1) + 'ay')
        else:
            trans_words.append(word)
    return ' '.join(trans_words)


word_transfer('Pig Latin string idle')
27
import random
def generaize_sents(num):
    """Return a string of *num* characters drawn at random from ``'aehh '``.

    ``'h'`` appears twice in the alphabet, so it is drawn twice as often as
    each of the other characters.
    """
    alphabet = 'aehh '
    return ''.join(random.choice(alphabet) for _ in range(num))


generaize_sents(500)
29
from nltk.corpus import brown
# Readability score per Brown category:
#   4.71 * (mean characters per word) + 0.5 * (mean words per sentence) - 21.43
# (the formula matches the Automated Readability Index).
for cate in brown.categories():
    words = brown.words(categories=cate)
    sents = brown.sents(categories=cate)
    word_sum = sum(len(token) for token in words)
    sent_sum = sum(len(sentence) for sentence in sents)
    avg_word = word_sum / len(words)
    avg_sent = sent_sum / len(sents)
    score = 4.71 * avg_word + 0.5 * avg_sent - 21.43
    print(cate + ":" + str(score))
30
import nltk
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
# Only the first 50 stems are printed, so slice the corpus before stemming
# instead of stemming the whole category and discarding almost all of it.
sample = brown.words(categories='belles_lettres')[:50]
print([porter.stem(w) for w in sample])
print([lancaster.stem(w) for w in sample])
33
# str.index() gives the position of the first occurrence of the substring;
# 'sj' starts at index 8 here.
'inexxpersjsnk'.index('sj')
# The first 20 tokens of the 'belles_lettres' category.
words = brown.words(categories='belles_lettres')[:20]
# Position of the token 'They' within those 20 tokens.
# NOTE(review): assumes 'They' occurs in this slice - confirm against the corpus.
words.index('They')
# Every token that comes before 'They'.
words[:words.index('They')]
38
def multi_raw(string):
    """Re-join words that were hyphenated across a line break.

    Each space-separated chunk has its whitespace (including the embedded
    line break) removed. If the chunk without its hyphens is in the NLTK
    word list, the hyphens are dropped as well; otherwise they are kept.

    Args:
        string: Text whose words are separated by single spaces.

    Returns:
        The processed words, each preceded by one space, so the result
        starts with a space.
    """
    # Load the word list once and as a set: calling words() for every word
    # and scanning the resulting list made each lookup very slow.
    vocabulary = set(nltk.corpus.words.words())
    pieces = []
    for w in string.split(' '):
        w = ''.join(re.findall(r'\S', w))
        dehyphenated = w.replace('-', '')
        if dehyphenated in vocabulary:
            w = dehyphenated
        pieces.append(' ' + w)
    return ''.join(pieces)


multi_raw("habe long-\nterm encyclo-\npedia")
41
# Random "words": 100 characters drawn from the alphabet below, then split
# on whitespace.
words = ''.join(
    random.choice('abcudefghijklmnop ') for _ in range(100)
).split()
# Keep only the vowels of each word, in their original order.
res = [
    ''.join(ch for ch in word if ch in 'aeiou')
    for word in words
]
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐


所有评论(0)