BERT
BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained language model based on the Transformer architecture, proposed by Google in 2018. BERT is designed to learn general-purpose language representations from large-scale unsupervised training, so that it performs well across a wide range of natural language processing (NLP) tasks.
BERT's core innovation is bidirectional context modeling. Unlike traditional language models that condition on only one side of the context, BERT uses the Transformer's self-attention mechanism to attend to the left and the right context at the same time, which lets it better capture what a word means in each context.
BERT is trained in two stages: pre-training and fine-tuning.
In the pre-training stage, the model is trained on large amounts of unlabeled text with two tasks, "masked language modeling" and "next sentence prediction", to learn word- and sentence-level semantic representations. In masked language modeling, BERT randomly masks some tokens in the input sentence and then tries to predict them. In next sentence prediction, BERT decides whether two sentences are adjacent in the original text.
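The masked-language-modeling idea can be illustrated with the Hugging Face transformers library. This is a minimal sketch, not part of the project code; the example sentence and the use of BertForMaskedLM are assumptions for illustration only.
import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# An illustrative fill-in-the-blank input (not from the project data)
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Find the position of the [MASK] token and take the highest-scoring vocabulary entry
mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_pos].argmax(dim=-1)
print(tokenizer.decode(predicted_id))  # expected to print something like "paris"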
In the fine-tuning stage, the BERT model is trained with supervision on a specific downstream task. By fine-tuning the pre-trained parameters on that task, BERT adapts to concrete NLP tasks such as text classification, named entity recognition, and question answering. During fine-tuning, a task-specific head is added on top of the model (or its upper layers are modified), and labeled data are used for supervised learning.
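As a minimal sketch of fine-tuning with transformers, BertForSequenceClassification adds a classification head on top of the encoder; the binary task, labels and example sentences below are assumptions for illustration, not the project's actual downstream task.
import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# num_labels=2 assumes a binary classification task; a randomly initialized
# classification head is added on top of the pre-trained encoder
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

inputs = tokenizer(["great movie", "terrible movie"], padding=True, return_tensors="pt")
labels = torch.tensor([1, 0])
outputs = model(**inputs, labels=labels)  # passing labels makes the model return a loss
outputs.loss.backward()                   # gradients flow into both the head and BERT itself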
BERT's strength is that the general-purpose language representations learned during pre-training transfer well to different NLP tasks. It achieved excellent results on multiple benchmark datasets, and on some tasks it even surpassed human-level performance. Its open-source code and released pre-trained models make it easy for researchers and developers to use and extend this powerful language model.
BERT's pre-training takes part of its own input as the label, i.e., the target output: the removed part is known externally, and the model practices filling in the blank. In fact, BERT works well not only in language processing but also on audio and image data.
BERT's fill-in-the-blank workflow
From the vectors of the surrounding context, compute the candidate with the highest probability.
Pass the input text to a pre-trained BERT model; the model turns each token into a vector representation, and these vectors can serve as the representation or features of the input text for downstream tasks. In other words, we use BERT to convert question text into embeddings, replacing the DKT model's original one-hot input representation in the hope of better prediction performance.
Getting the question text
import json  # used by the qid2* helpers below

def getContext(record):
    # Assemble the question text plus its four options into one string.
    ls = ["", "A", "B", "C", "D"]
    option = list()
    option.append(record["A"])
    option.append(record["B"])
    option.append(record["C"])
    option.append(record["D"])
    ans = []
    for index in range(4):
        ans.append(ls[index + 1] + ":" + option[index])
    return record["context"] + "\n" + "; ".join(ans)
def getSkill(subjectnames, record):
    # record is a string such as "[3, 49, 54, 258]"; strip the brackets and split on ", ".
    record = record[1:-1].split(", ")
    ans = []
    for rr in record:
        # subjectnames maps a SubjectId to its subject name,
        # so each element of ans looks like "[Volume and Surface Area]".
        ans.append("[" + subjectnames[rr] + "]")
    # Join the elements with "; " into one long string: "[subject1]; [subject2]; ...; [subjectn]".
    return "; ".join(ans)
def getContextAndSkill(subjectnames, record):
    context = record["Context"] + "\n"
    context = context + "A: " + record["Option_A"] + "; B: " + record["Option_B"]
    context = context + "; C: " + record["Option_C"] + "; D: " + record["Option_D"]
    skill = record["SubjectId"]
    skill = skill[1:-1].split(", ")
    ans = []
    for rr in skill:
        ans.append("[" + subjectnames[rr] + "]")
    # One long string: "context\nRelated concepts: [subject1]; [subject2]; ...; [subjectn]"
    return context + "\nRelated concepts: " + "; ".join(ans)
def qid2context(filename):
    # Map each question ID to its question text plus all of its options.
    res = dict()
    cnt = 0
    with open(filename, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            qid = line["ID"]
            res[qid] = getContext(line)
            cnt += 1
    return res
def qid2skill(filenamepath, filesubjectname):
    cnt = 0
    subjectnames = dict()
    with open(filesubjectname, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            qid = line["SubjectId"]
            subjectnames[qid] = line["Name"]
            # e.g. subjectnames["100"] = "Volume and Surface Area":
            # a lookup table from SubjectId to subject name.
    res = dict()
    filename = filenamepath + "/Train_Question_Response_Records_Ordered.json"
    with open(filename, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            # Each line is a record such as:
            # {
            #   "AnswerId": "878344",
            #   "QuestionId": "10",
            #   "UserId": "0",
            #   "CorrectAnswer": "1",
            #   "AnswerValue": "1",
            #   "IsCorrect": "1",
            #   "Context": "What is the equation of this line of symmetry? pic10_0",
            #   "Option_A": "y=-3.5",
            #   "Option_B": "y=-7",
            #   "Option_C": "y=-2",
            #   "Option_D": "Not possible to work out",
            #   "GroupId": "216",
            #   "QuizId": "55",
            #   "SubjectId": "[3, 49, 54, 258]",
            #   "DateAnswered": "2020-04-22 10:28:00.000"
            # }
            qid = line["QuestionId"]
            # subjectnames is the lookup table built above; line["SubjectId"] is a string like "[3, 49, 54, 258]".
            res[qid] = getSkill(subjectnames, line["SubjectId"])
            cnt += 1
    filename = filenamepath + "/Test_Question_Response_Records_Ordered.json"
    with open(filename, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            qid = line["QuestionId"]
            res[qid] = getSkill(subjectnames, line["SubjectId"])
            cnt += 1
    # The keys of res are question IDs; the values are still strings.
    return res
def qid2contextandskill(filenamepath, filesubjectname):
    cnt = 0
    subjectnames = dict()
    with open(filesubjectname, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            qid = line["SubjectId"]
            subjectnames[qid] = line["Name"]
    res = dict()
    filename = filenamepath + "/Train_Question_Response_Records_Ordered.json"
    with open(filename, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            qid = line["QuestionId"]
            res[qid] = getContextAndSkill(subjectnames, line)
            cnt += 1
    filename = filenamepath + "/Test_Question_Response_Records_Ordered.json"
    with open(filename, "r") as file:
        dictTemp = json.load(fp=file)
        for line in dictTemp["RECORDS"]:
            qid = line["QuestionId"]
            res[qid] = getContextAndSkill(subjectnames, line)
            cnt += 1
    return res
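A hypothetical usage sketch of the helpers above; the directory name and file paths are assumptions that only mirror the data-structure examples at the end of this section.
# Hypothetical paths; adjust to the actual directory layout
convert_res = qid2contextandskill("data", "data/subject_metadata.json")
print(convert_res["10"])
# -> "What is the equation of this line of symmetry? pic10_0\n
#     A: y=-3.5; B: y=-7; C: y=-2; D: Not possible to work out\n
#     Related concepts: [...]; [...]; [...]; [...]"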
Code needed to turn text into embeddings with the BERT model
"""
1.from_pretrained()方法是类PreTrainedModel的一个方法
2.这里的如果加上 output_hidden_states=True,那么就会把所有的hidden_states 给输出
如果没有加,那么就只能得到最后一个隐层的输出。
"""
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(["Hello", "How are you?", "What's your name?"], padding=True, return_tensors='pt')
"""
执行model[BertModel实例]的forward()方法,但是在执行之前,仍然做了很多其他的事情
在 Python 中,** 是解包操作符,它可以将一个字典中的键值对解包成关键字参数。
这意味着 model(**inputs) 实际上相当于 model(input_ids=..., attention_mask=..., ...),
其中 ... 是 inputs 字典中的各种键值对
outputs第一个元素是模型的输出(通常是预测值),第二个元素是池化后的输出(通常是用于分类任务的输出),
而后续的元素通常是模型的隐藏状态,用于一些特定的任务或者后续的特征提取
"""
with torch.no_grad(): # 关闭自动求导功能可以节省内存并提高计算速度
outputs = model(**inputs)
last_hidden_states = outputs[0]
cls_embedding = outputs.last_hidden_state[:, 0, :]
"""
1.如果我的句子是 "hello,my dog is cute",那么得到last_hidden_state 的size
就是torch.Size([1, 8, 768]);如果我的句子是"hello,my dog",那么得到的last_hidden_state
的 就是 torch.size([1,6,768])。也就是中间那个维度的大小是跟句子长度有关系
"""
# print(last_hidden_states)
print(last_hidden_states.size())
print(cls_embedding.size())
"""
inputs 是个字典,的内容如下:
{'input_ids': tensor([[ 101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]),
'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])
}
"""
Differences from the original input (the snippets below are excerpts from the batch-construction loop; variables such as i, index, batch_size, num_steps and num_skills come from the surrounding code). With BERT embeddings, input_data is the stack of [CLS] vectors with shape [batch_size, num_steps, hidden_size]; in the original version further below, input_data is a one-hot tensor of shape [batch_size, num_steps, input_size].
for j in range(len(problem_ids) - 1):
    # problem_id = int(problem_ids[j])
    problem_id = problem_ids[j]
    context.append(convert_res[problem_id])
    # num_steps: the maximum number of questions answered by any student
    # num_skills: the largest question ID
    # target_id records the position of the target inside logits
    target_id.append(i * num_steps * num_skills + j * num_skills + int(problem_ids[j + 1]))
    target_correctness.append(int(correctness[j + 1]))
    actual_labels.append(int(correctness[j + 1]))
problem_id = problem_ids[len(problem_ids) - 1]
context.append(convert_res[problem_id])
inputs = tokenizer(context, return_tensors='pt', padding=True)
with torch.no_grad():  # turning off autograd saves memory and speeds up computation
    outputs = model(**inputs)
cls_embedding = outputs.last_hidden_state[:, 0, :]
all_cls_embeddings.append(cls_embedding)
# Combine all cls_embedding tensors with torch.stack();
# the result should have shape [batch_size, context_size, hidden_size]
all_cls_embeddings = torch.stack(all_cls_embeddings, dim=0)
# index points to the next batch
index += batch_size
count += 1
target_id = torch.tensor(target_id, dtype=torch.int64)
target_correctness = torch.tensor(target_correctness, dtype=torch.float)
input_data = all_cls_embeddings
####################################################################################
for j in range(len(problem_ids) - 1):
    problem_id = int(problem_ids[j])
    # label_index = 0
    if int(correctness[j]) == 0:
        label_index = problem_id
    else:
        label_index = problem_id + num_skills
    x[i, j] = label_index
    target_id.append(i * num_steps * num_skills + j * num_skills + int(problem_ids[j + 1]))
    target_correctness.append(int(correctness[j + 1]))
    actual_labels.append(int(correctness[j + 1]))
# index points to the next batch
index += batch_size
count += 1
target_id = torch.tensor(target_id, dtype=torch.int64)
target_correctness = torch.tensor(target_correctness, dtype=torch.float)
# One-hot encode the input data: [batch_size, num_steps, input_size]
# x records whether each student answered each question correctly
x = torch.tensor(x, dtype=torch.int64)
# unsqueeze adds a dimension of size one at the given position;
# x was 2-dimensional and becomes 3-dimensional here
x = torch.unsqueeze(x, 2)
input_data = torch.FloatTensor(batch_size, num_steps, input_size)
input_data.zero_()
# In the third dimension of input_data, set the positions recorded in x to 1
input_data.scatter_(2, x, 1)
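To make the scatter_ step concrete, here is a small self-contained sketch (the sizes and values are made up for illustration) showing how the 2-D index tensor x is turned into a one-hot tensor along the last dimension:
import torch

batch_size, num_steps, input_size = 2, 3, 5
# x[i, j] holds the label_index of student i at step j
x = torch.tensor([[1, 4, 0],
                  [2, 2, 3]], dtype=torch.int64)
x = torch.unsqueeze(x, 2)                     # [2, 3] -> [2, 3, 1]
input_data = torch.FloatTensor(batch_size, num_steps, input_size)
input_data.zero_()
input_data.scatter_(2, x, 1)                  # write 1 at the indexed position along dim 2
print(input_data[0, 0])                       # tensor([0., 1., 0., 0., 0.])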
Some data structures
(Question attributes) subject_metadata.json
{"RECORDS": [
{
"SubjectId": "100",
"Name": "Volume and Surface Area",
"ParentId": "71",
"Level": "2"
},
{
"SubjectId": "101",
"Name": "Data and Statistics",
"ParentId": "3",
"Level": "1"
},
{
"SubjectId": "102",
"Name": "Averages (mean, median, mode) from a List of Data",
"ParentId": "338",
"Level": "3"
},
{
"SubjectId": "103",
"Name": "Averages and Range from Grouped Data",
"ParentId": "338",
"Level": "3"
},
{
"SubjectId": "104",
"Name": "Box Plots",
"ParentId": "342",
"Level": "3"
},
{
"SubjectId": "105",
"Name": "Cumulative Frequency Diagram",
"ParentId": "342",
"Level": "3"
},
]
}
(Question text) question_context.json
{"RECORDS": [
{
"ID": "0",
"context": "\"If you multiply a square number by 9, you get a square number\" Is this statement...",
"A": "always true",
"B": "sometimes true",
"C": "never true",
"D": "Impossible to say"
},
{
"ID": "1",
"context": "How much bigger is 3/8 than 1/3?",
"A": "2/24",
"B": "2/5",
"C": "1/24",
"D": "4/11"
},
{
"ID": "10",
"context": "What is the equation of this line of symmetry? pic10_0",
"A": "y=-3.5",
"B": "y=-7",
"C": "y=-2",
"D": "Not possible to work out"
},
{
"ID": "100",
"context": "This is the first row of a factor tree.What number should replace the question mark? pic100_0",
"A": "8",
"B": "12",
"C": "21",
"D": "72"
},
]
}