《Python自然语言处理》第三章练习题答案

heize19

1936人浏览 · 2021-03-19 19:26:59

heize19 · 2021-03-19 19:26:59 发布

第三章
这本书网上能找到的答案很少。
这一章主要涉及字符串处理、正则表达式、爬虫、列表推导等内容。
1

# Insert a 'u' after the first three characters: 'colorful' -> 'colourful'.
s = 'colorful'
print(s[:3]+'u'+s[3:])

# 'colorful' has 8 characters, so -9 is one step past the left edge and
# this lookup raises IndexError.
s[-9]

# Step of 2: every second character, starting from index 0.
s[::2]

# Step of -1: the string read backwards.
s[::-1]

import re
string = "The purpose of this research was to create a framework of indicators that enabled us to measure the classic dimensions of sustainable development (SD): People, Planet, and Profit, in combination with the sustainability of the heritage values and the policy dimension. Methods developed as an approach to sustainable urban planning and that were based on system analysis"
# Match a single determiner. Alternation is tried left to right, so 'an' must
# come before 'a' (otherwise 'a' always wins and 'an' is unreachable); the
# \b anchors stop the pattern from firing inside words such as "was" or "that".
# IGNORECASE lets the sentence-initial "The" count as well.
determiners = re.findall(r'\b(?:an|a|the)\b', string, flags=re.IGNORECASE)
string = "The purpose of this research was to create 2+3*7"
# Match an integer arithmetic expression with + and *. Every operand is \d+
# so multi-digit numbers after an operator are consumed whole.
arithmetic = re.search(r'\d+(?:[+*]\d+)*', string)

from urllib import request

# Download one news page, echo the response status, headers and body,
# then strip the markup from the decoded text.
with request.urlopen('https://news.sina.com.cn/gov/xlxw/2021-03-18/doc-ikknscsi7715675.shtml') as response:
    data = response.read()
    print('Status:', response.status, response.reason)
    for header, value in response.getheaders():
        print(f'{header}: {value}')
    page_text = data.decode('utf-8')
    print('Data:', page_text)

# First pass removes anything that looks like a tag; second pass trims the
# whitespace at the very start and very end of the remaining text.
tag_pattern = re.compile(r'<(\S*?)[^>]*>.*?|<.*? />')
edge_space_pattern = re.compile(r'^\s*|\s*$')
without_html = tag_pattern.sub('', page_text)
without_space = edge_space_pattern.sub('', without_html)

def load(f):
    """Return every punctuation mark found in the text file at path ``f``.

    The file is read as UTF-8 first; if that fails to decode, it is read
    again as GBK.

    Args:
        f: Path of the text file to read.

    Returns:
        A list with one entry per punctuation character matched, in order
        of appearance.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is neither valid UTF-8 nor GBK.
    """
    # The handle gets its own name: rebinding ``f`` would make the GBK
    # fallback call open() on a closed file object instead of the path.
    try:
        with open(f, encoding='utf8') as handle:
            data = handle.read()
    except UnicodeDecodeError:
        with open(f, encoding='gbk') as handle:
            data = handle.read()

    return re.findall( r'[\。\，\；\“\’\？\(\)\：\-\——\`]',data)

# Tokenizer for a few special token shapes. All groups are non-capturing so
# that findall() returns the matched text itself rather than tuples of group
# contents, and the date branch comes before the number branch because
# alternatives are tried in order (otherwise "2011" is consumed as a number).
pattern = re.compile(r"""
(?:[A-Z]\.)+                  # abbreviations made of capital letters and dots
|[A-Z][a-z]*\s[A-Z][a-z]*     # two consecutive capitalised words
|\d+-\d+-\d+                  # dates written as digits-digits-digits
|\$?\d+(?:\.\d+)?%?           # numbers, optionally with $ prefix or % suffix
""", re.VERBOSE)
text='That U.S.A 2011-2-3 soub $12.22'
res=pattern.findall(text)
print(res)

# Pair every word with its length.
sent = ['ans', 'sjs', 'aaa']
res = list(zip(sent, map(len, sent)))
res

raw = 'aksjd askjdn eurs snnd'
# Any string can be the separator; here the text is cut at every letter 's'.
raw.split('s')

# Cut at single spaces and print one piece per line.
for w in raw.split(' '):
    print(w)

# Difference between split() and split(' '): split() treats every run of
# whitespace (spaces, tabs, ...) as one separator, while split(' ') cuts at
# each individual space only.
raw='skjkaj   sass  \t'
raw.split()
raw.split(' ')

import jieba
'''
sort()方法是针对列表类型的变量的一种排序方法，使用时：比如列表类型的变量ls, ls.sort()
sorted()方法是针对所有类型的变量，比sort()使用范围更广，使用时：sorted(ls)
'''
sent='people, Xi extended sincere greetings and best wishes to Bangladeshi President Abdul Hamid, Bangladeshi Prime Minister Sheikh Hasina, and the Bangladeshi government and people. (Xinhua/Li Xueren)'
# Whitespace-split the sentence into a list of tokens.
words = sent.split()
# list.sort() reorders the list in place and returns None.
words.sort()
# sorted() builds and returns a new sorted list; the result is not stored here.
sorted(words)

"3"*7
3*7

%load monty.py
monty = 'Monty python'

string = 'ashshser alskdn eeee'
# %6s / %-6s pad to a minimum width of six (right- and left-aligned). The
# width is only a minimum, so a longer string is printed unchanged.
# The previous '%a-6s' was a typo: %a formats with ascii() and the "-6s"
# that followed was emitted as literal text.
right_aligned = '%6s' % string
left_aligned = '%-6s' % string
print(right_aligned)
print(left_aligned)

from nltk.corpus import brown

# Collect every Brown corpus token that starts with "wh"/"Wh" followed by at
# least one more letter, then reduce the tokens to their distinct types.
words = brown.words()
whs = [token for token in words if re.match(r'[wW]h[A-Za-z]+', token)]
set(whs)

import requests 
import bs4
import re 
url='http://news.weather.com.cn/2021/03/3449612.shtml'
response = requests.get(url)
# Set the encoding explicitly so response.text is decoded as UTF-8.
response.encoding='utf-8'
soup = bs4.BeautifulSoup(response.text,'html.parser')
# Print the text of every <div class="articleBody">. find_all is the
# documented method name; findAll is only a deprecated alias for it.
for article in soup.find_all('div', class_='articleBody'):
    print(article.get_text())

from nltk.corpus import words
nltk_words = words.words()
def unknown(url):
    """Return the words on a web page that are missing from ``nltk_words``.

    The page is fetched, the text of all ``<p>`` elements is collected and
    cut at spaces; from each piece the first run of ASCII letters is taken,
    lower-cased and looked up in the module-level ``nltk_words`` list.

    Args:
        url: Address of the page to download.

    Returns:
        A list of lower-cased words not found in ``nltk_words``, in page
        order, duplicates included.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # Join paragraphs with a space so the last word of one paragraph is not
    # glued to the first word of the next.
    content = ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))

    # One set build per call replaces a linear scan of the word list per token.
    vocabulary = set(nltk_words)
    unknown_words = []
    for piece in content.split(' '):
        letter_runs = re.findall(r'[A-Za-z]+', piece)
        if not letter_runs:
            continue
        word = letter_runs[0].lower()
        if word not in vocabulary:
            unknown_words.append(word)
    return unknown_words
unknown('http://www.chinadaily.com.cn/a/202103/19/WS6053fedba31024ad0bab0310.html')

# Split "don't" into "do n't" by plain substitution.
plain_result = re.sub(r"don't", "do n't", "don't touch me")
print(plain_result)
# The second input is itself a piece of pattern text. It must be a raw
# literal because "\w" is not a valid string escape. The substitution still
# applies: the subject string is never interpreted as a regex.
pattern_text_result = re.sub(r"don't", "do n't", r"《don't|\w+》")
print(pattern_text_result)

def word_token(string):
    """Convert text to "hAck3r" spelling.

    Replacements: ``ate`` -> ``8``, ``e`` -> ``3``, ``i`` -> ``1``,
    ``o`` -> ``0``, ``l`` -> ``|``, ``s`` -> ``5``, ``.`` -> ``5w33t!``.

    Args:
        string: Text to convert.

    Returns:
        The converted text.
    """
    replacements = {
        'ate': '8',
        'e': '3',
        'i': '1',
        'o': '0',
        'l': '|',
        's': '5',
        '.': '5w33t!',
    }
    # A single pass with 'ate' listed first: chained substitutions would
    # turn the 'e' of "ate" into '3' before "ate" is tried, and would
    # re-process characters produced by earlier replacements.
    return re.sub(r'ate|[eiols.]', lambda m: replacements[m.group(0)], string)
word_token('esjioa1s.ppate')

def word_token(string):
    """Replace a leading ``s`` with ``$`` and every other ``s`` with ``5``.

    Args:
        string: A single word.

    Returns:
        The converted word; a word without ``s`` (including the empty
        string) is returned unchanged.
    """
    # startswith() is safe on the empty string, unlike string[0].
    if string.startswith('s'):
        return '$' + string[1:].replace('s', '5')
    return string.replace('s', '5')
word_token('shs')

def word_transfer(string):
    """Convert a space-separated sentence to Pig Latin, word by word.

    For each word the leading run of non-vowel characters is moved to the
    end and ``ay`` is appended (``string`` -> ``ingstray``). A word that
    starts with a vowel just gets ``ay`` appended (``idle`` -> ``idleay``).
    Words starting with an ASCII capital letter and empty pieces (from
    repeated spaces) are left untouched.

    Args:
        string: Sentence whose words are separated by single spaces.

    Returns:
        The converted sentence, joined with single spaces.
    """
    converted = []
    for word in string.split(' '):
        if not word or re.match(r'[A-Z]', word):
            converted.append(word)
            continue
        # '*' always matches, so words without any vowel no longer crash;
        # their whole text counts as the leading cluster.
        cluster = re.match(r'[^aeiou]*', word).group(0)
        converted.append(word[len(cluster):] + cluster + 'ay')
    return ' '.join(converted)
word_transfer('Pig Latin string idle')

import random
def generaize_sents(num):
    """Build a random string of ``num`` characters drawn from ``'aehh '``.

    Each character is an independent ``random.choice`` draw.
    """
    return ''.join(random.choice('aehh ') for _ in range(num))
generaize_sents(500)

from nltk.corpus import brown

# For every Brown category, print "<category>:<score>" where the score is
# 4.71 * (mean characters per word) + 0.5 * (mean words per sentence) - 21.43.
for cate in brown.categories():
    words = brown.words(categories=cate)
    avg_word = sum(len(token) for token in words) / len(words)

    sents = brown.sents(categories=cate)
    avg_sent = sum(len(sentence) for sentence in sents) / len(sents)

    score = 4.71*avg_word+0.5*avg_sent-21.43
    print(f"{cate}:{score}")

import nltk
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
# Only the first 50 tokens are shown, so take that sample before stemming
# rather than stemming the entire category and discarding the rest.
sample = brown.words(categories='belles_lettres')[:50]
print([porter.stem(w) for w in sample])
print([lancaster.stem(w) for w in sample])

# str.index accepts a multi-character substring and returns the position
# where its first occurrence starts (8 for this string).
'inexxpersjsnk'.index('sj')

# The same method name on a sequence of words looks up a whole element.
# NOTE(review): assumes 'They' occurs in these 20 tokens - confirm.
words = brown.words(categories='belles_lettres')[:20]
words.index('They')

# Everything before the first 'They'.
words[:words.index('They')]

def multi_raw(string, vocabulary=None):
    """Rejoin words that were hyphenated across a line break.

    The text is cut at spaces and all remaining whitespace (such as the
    line break inside ``long-\\nterm``) is removed from each piece. If the
    piece with its hyphens removed is a known word, the hyphen-free form is
    used; otherwise the hyphens are kept.

    Args:
        string: Text whose words are separated by single spaces.
        vocabulary: Optional container of known words. When omitted, the
            NLTK ``words`` corpus is loaded once and used.

    Returns:
        The processed pieces joined with single spaces.
    """
    if vocabulary is None:
        # Load once per call and use a set: reloading the list and scanning
        # it for every token is needlessly slow.
        vocabulary = set(nltk.corpus.words.words())

    pieces = []
    for piece in string.split(' '):
        piece = re.sub(r'\s+', '', piece)
        dehyphenated = piece.replace('-', '')
        pieces.append(dehyphenated if dehyphenated in vocabulary else piece)
    # join() avoids the stray leading space that prefix-concatenation produced.
    return ' '.join(pieces)
multi_raw("habe long-\nterm encyclo-\npedia")

# Generate 100 random characters, split them into "words" at whitespace,
# and keep only the vowels of each word, in their original order.
words = ''.join(random.choice('abcudefghijklmnop ') for _ in range(100)).split()
res = [''.join(filter(lambda ch: ch in 'aeiou', word)) for word in words]

魔乐社区

魔乐社区（Modelers.cn）是一个中立、公益的人工智能社区，提供人工智能工具、模型、数据的托管、展示与应用协同服务，为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作，由全产业链共同建设、共同运营、共同享有，推动国产AI生态繁荣发展。

更多推荐

小参数・大码力・易部署 | Qwen3.6-27B上线魔乐社区，基于昇腾的部署教程来了

继一周前模型开源发布后，千问再度开源Qwen3.6-27B —— 一个拥有270亿参数的稠密多模态模型，也是社区呼声最高的模型规格。Qwen3.6-27B 依然支持多模态思考与非思考模式，在智能体编程方面达到了旗舰级表现，全面超越前代开源旗舰 Qwen3.5-397B-A17B（总参数397B / 激活参数17B的MoE模型）。作为稠密架构，它无需MoE路由即可部署，是开发者在实用、可广泛部署规模