安装繁简转换函式库
pip install hanziconv
在昨天的分类中,把简体的评论改成繁体。
import pandas as pd
import os
from hanziconv import HanziConv
# Load the hotel-review corpus and put its rows in random order.
all_df = pd.read_csv("ChnSentiCorp_htl_all.csv")
shuffled = all_df.sample(frac=1).reset_index(drop=True)

# The first 80% of the shuffled rows become the training split, the rest the test split.
split_index = int(len(shuffled) * 0.8)
train_df = shuffled.iloc[:split_index]
test_df = shuffled.iloc[split_index:]

# One output folder per (split, sentiment) pair.
mypaths = ["chinese/train/neg", "chinese/train/pos", "chinese/test/neg", "chinese/test/pos"]
for folder in mypaths:
    os.makedirs(folder, exist_ok=True)

# Each split is paired with a label -> destination-prefix table
# (label 1 goes to "pos", label 0 goes to "neg").
export_plan = (
    (train_df, {1: "chinese/train/pos/", 0: "chinese/train/neg/"}),
    (test_df, {1: "chinese/test/pos/", 0: "chinese/test/neg/"}),
)

for frame, prefix_by_label in export_plan:
    for idx, record in frame.iterrows():
        prefix = prefix_by_label.get(record["label"])
        if prefix is None:
            # Rows whose label is neither 1 nor 0 are not written anywhere.
            continue
        # One text file per review, named after the row index and stored
        # after conversion to Traditional Chinese.
        with open(prefix + str(idx) + ".txt", "w", encoding="UTF-8") as out:
            out.write(HanziConv.toTraditional(str(record["review"])))
将 tf.keras.preprocessing.text_dataset_from_directory 读取的资料夹从 aclImdb 改为我们刚才分好的 chinese。
# Settings used by the dataset loaders below.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# Short alias for the directory loader, plus the arguments that the training
# and validation calls have in common (same batch size, split fraction, seed).
_load_text_dir = tf.keras.preprocessing.text_dataset_from_directory
_train_val_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# Training subset read from 'chinese/train'; class names are taken from the
# raw dataset before cache()/prefetch() are applied.
raw_train_ds = _load_text_dir('chinese/train', subset='training', **_train_val_options)
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Validation subset read from the same 'chinese/train' directory.
val_ds = _load_text_dir('chinese/train', subset='validation', **_train_val_options)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Test set read from 'chinese/test'; no validation split or seed is passed here.
test_ds = _load_text_dir('chinese/test', batch_size=batch_size)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
最后,将使用的 bert 模型从 en (英文)
转为 multi_cased (多语言)。
<<: 卡夫卡的藏书阁【Book25】- Kafka - KafkaJS Admin 2
>>: 自动化 End-End 测试 Nightwatch.js 与 BrowserStack
登入登出检核 之前已经针对登入登出进行控管,非登入无法进入会员页,按照此网站的需求,订单建置、订单查...
1.NPM版本 无须更新到最新,怕错误 2.制作专案package.json npm init np...
Synology 虽然提供很方便的 QuickConnect 可让用户端应用程序透过网际网路连线至 ...
前言 今天我们以开发者的角度,实际走过 GitOps 的工作流程,这次 Lab 准备了 NodeJS...
今天分享程序码从github更新到replit的步骤还有要注意的点 步骤 replit有自带储存环境...