# author: 小菜G
"""
NLP命名体识别bilstm+crf
1、准备数据:origin_handle_entities()
读取源数据文件,把人名,地名,机构名合并起来
2、读取处理后的数据:origin_handle_mark()
把预处理后的文本标注成BMEO的格式,
B(begin)、M(middle)、E(end)、O(other)
3、句子切分:sentence_split()
按照指定的格式,比如标点等内容对数据完成切分
4、保存数据
a.将标注的句子拆分自成列表和对应的标注序列
b.创建词汇表和标签
c.文本的向量化表示
d.划分训练集和测试集
e.保存成二进制pkl文件
5、加载数据
6、训练模型BiLSTM+CRF
7、保存训练后的模型用于预测
8、预测
"""
import codecs
import re
import collections
import pickle
import TorchCRF as CRF
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences #使用tensorflow的pad_sequences进行数据对齐 tensorflow2.3.1
from sklearn.model_selection import train_test_split
def origin_handle_entities():
    """Merge multi-token entity names in the raw tagged corpus.

    Reads the People's-Daily-style corpus "renmin.txt" and writes
    "middle/renmin2.txt" where
      * bracketed compounds ``[w1/t1 w2/t2]tag`` are joined into one token,
      * two consecutive ``/nr`` (person-name) tokens are merged into one,
      * every other token is re-tagged ``/no``.

    Pure file-to-file transformation; returns None.
    """
    # Original source lost the line continuation in this multi-line `with`.
    with open("renmin.txt", "r", encoding="utf-8") as inp, \
            open("middle/renmin2.txt", "w", encoding="utf-8") as outp:
        for line in inp:
            tokens = line.split(" ")
            i = 1  # token 0 is the date stamp; the last token is skipped too
            while i < len(tokens) - 1:
                if tokens[i][0] == "[":
                    # Start of a bracketed compound: emit bare words until "]".
                    outp.write(tokens[i].split("/")[0][1:])
                    i += 1
                    while i < len(tokens) - 1 and tokens[i].find("]") == -1:
                        if tokens[i] != "":
                            outp.write(tokens[i].split("/")[0])
                        i += 1
                    # Closing token: word plus the tag that follows "]".
                    # (Original wrote the raw token here, leaking "]tag".)
                    outp.write(tokens[i].split("/")[0].strip() + "/"
                               + tokens[i].split("]")[1].strip() + " ")
                elif tokens[i].split("/")[1] == "nr":
                    # Merge a split person name (surname/nr given-name/nr).
                    word = tokens[i].split("/")[0]
                    i += 1
                    if i < len(tokens) - 1 and tokens[i].split("/")[1] == "nr":
                        # Original wrote "nr" without the "/" separator.
                        outp.write(word + tokens[i].split("/")[0] + "/nr ")
                    else:
                        outp.write(word + "/nr ")
                        # Current token was not consumed — reprocess it.
                        continue
                else:
                    outp.write(tokens[i] + "/no ")
                i += 1
            # One corpus line per output line (original wrote a space).
            outp.write("\n")
import codecs
def origin_handle_mark():
    """Tag the merged corpus character-by-character (B/M/E/O marks).

    1. read the preprocessed middle/renmin2.txt
    2. write the character-level tagged text to middle/renmin3.txt

    NOTE(review): only the file scaffolding survived in this source — the
    tagging loop body is missing. Restore it before running the pipeline.
    """
    # Original source lost the line continuation in this multi-line `with`.
    with codecs.open("middle/renmin2.txt", "r", encoding="utf-8") as inp, \
            codecs.open("middle/renmin3.txt", "w", encoding="utf-8") as outp:
        # TODO(review): re-add the per-character BMEO tagging loop over inp.
        pass
#########句子切分###################################
import re
def sentence_split():
    """Split the tagged text into sentences, one per output line.

    Reads middle/renmin3.txt and writes middle/renmin4.txt, cutting the
    text wherever a punctuation character carries the O tag.
    """
    # Fixed "middel" path typo and the lost `with` line continuation.
    with codecs.open("middle/renmin3.txt", "r", encoding="utf-8") as inp, \
            codecs.open("middle/renmin4.txt", "w", encoding="utf-8") as outp:
        # Python 3 reads the file as str already; the original
        # encode/decode round-trip was a no-op and is dropped.
        texts = inp.read()
        # Split on punctuation tokens tagged /O.  The original showed
        # /[0] (digit zero), but the tag set is B/M/E/O per the module
        # docstring — using the letter O here.
        sentences = re.split('[,。!?、“”":]/[O]', texts)
        for sentence in sentences:
            if sentence != " ":
                outp.write(sentence.strip() + "\n")
def data_to_pkl():
    """Save the processed text data as a binary pkl file.

    NOTE(review): stub — the implementation (vocabulary building,
    vectorization, train/test split, pickle dump described in the module
    docstring, step 4) is missing from this source.
    :return: None
    """
def main():
    """Run the data-preparation pipeline end to end."""
    # 1. data cleaning: merge entity names
    origin_handle_entities()
    # 2. character-level tagging
    origin_handle_mark()
    # 3. sentence segmentation
    sentence_split()
    # 4. vectorize and dump to pkl
    data_to_pkl()


# Original guard read `if name == " main ":` — underscores lost in transit.
if __name__ == "__main__":
    main()
##################################################################################################
def load_data():
    """Load the pickled, vectorized dataset.

    :return: 10-tuple ``(word2id, id2word, tag2id, id2tag,
             x_train, y_train, x_test, y_test, x_valid, y_valid)``.

    Bug fixed: the original unpacked the pickle but returned None, so
    every caller (``main``, ``parse_tags``, the module-level unpack below)
    received nothing.
    """
    pickle_path = "../data_target_pkl/renmindata.pkl"
    with open(pickle_path, "rb") as inp:
        # NOTE: pickle.load on an untrusted file is unsafe; this path is a
        # local artifact produced by data_to_pkl().
        (word2id, id2word, tag2id, id2tag,
         x_train, y_train, x_test, y_test, x_valid, y_valid) = pickle.load(inp)
    return (word2id, id2word, tag2id, id2tag,
            x_train, y_train, x_test, y_test, x_valid, y_valid)
def main():
    """Smoke-test the pickle loader: print the size of what it returns."""
    word2id = load_data()
    print(len(word2id))


# Original guard read `if name == " main ":` — underscores lost in transit.
if __name__ == "__main__":
    main()
#######################################################################################
import torch
import torch.nn as nn
from torch.utils.data import Dataset # 批量读取数据
class NERDataSet(Dataset):
    """Map-style Dataset over paired samples and labels.

    X: samples (token-id sequences), Y: labels (tag-id sequences).
    """

    def __init__(self, X, Y, *args, **kwargs):
        # Original read `def init (self,X,Y, args, *kwargs)` — the
        # dunder underscores and the ** were lost in transit.
        # NOTE(review): the original body is missing from this source;
        # storing X/Y and serving them via __len__/__getitem__ is the
        # minimal Dataset contract — confirm against the full tutorial.
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return {"x": self.X[index], "y": self.Y[index]}
class Config:
    """Hyper-parameters for the BiLSTM-CRF model."""
    embedding_dim = 100  # word-embedding dimension
    hidden_dim = 200     # LSTM hidden size
    # NOTE(review): NERLSTM_CRF also reads config.vocab_size and
    # config.num_tags, which are not defined here — they were likely
    # lost from this source; restore before constructing the model.


# Shared, module-level configuration instance.
config = Config()
class NERLSTM_CRF(nn.Module):
    """BiLSTM-CRF sequence tagger.

    1. input layer
    2. embedding: Embedding(vocab_size, embedding_dim)
    3. LSTM
    4. fully-connected output layer

    NOTE(review): truncated — layer construction (embedding/LSTM/FC/CRF)
    and forward() are missing from this source.
    """

    def __init__(self):
        # Dunder underscores restored (`init` -> `__init__`).
        super(NERLSTM_CRF, self).__init__()
        # Typo fixed: Config defines `embedding_dim`, so the original
        # `config.embeding_dim` would raise AttributeError.
        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        # NOTE(review): Config as shown lacks these two attributes — confirm.
        self.vocab_size = config.vocab_size
        self.num_tags = config.num_tags
##################################################
from torch.utils.data import DataLoader #批量加载数据
import torch
import torch.optim as op
def utils_to_train():
    """Build the training setup.

    NOTE(review): truncated — the DataLoader construction, model creation
    and the return statement are missing from this source.  The caller
    below unpacks a 6-tuple ``(max_epoch, device, train_data_loader,
    valid_data_loader, test_data_loader, model)``; restore the tail of
    this function before running.
    """
    device = torch.device("cpu")  # train on CPU
    max_epoch = 1  # number of training epochs
    batch_size = 32  # mini-batch size
    num_workers =4  # number of DataLoader worker threads
def parse_tags(text,path):
    """Map a predicted id path back to tag strings.

    NOTE(review): truncated — `tags` is built but never returned and
    `text` is unused.  Also, load_data() returns the full dataset tuple,
    not the id->tag mapping alone; `load_data()[3]` (id2tag) is probably
    what was intended here — confirm against the original tutorial.
    """
    id2tag = load_data()
    tags = [id2tag[idx] for idx in path]
##################################################
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score
# Module-level training setup for ChineseNER below.
word2id = load_data()[0]  # vocabulary: token -> id
# NOTE(review): utils_to_train() as defined above has no return statement,
# so this 6-way unpack cannot succeed until its tail is restored.
max_epoch,device,train_data_loader,valid_data_loader,test_data_loader,model = utils_to_train()
class ChineseNER(object):
def train(self):
for epoch in range(max_epoch):