本文最后更新于 2024-03-08,文章内容可能已经过时。

背景:

为了搭建疾病和症状相关的知识图谱

从电子病历中获得出入院信息进行NER识别之后得到名词实体列表

需要取出其中的症状。观察后发现实体词大致分为以下几类,我们需要取出的是症状(异常词),而不是正常词

由于不能使用其他大模型,只能用目前开发的大模型,但准确率不高,所以需要写脚本辅助实现

左心功能低下

不带否定词的症状

生命体征正常

正常词

ALT1107U/L

医疗名称

无肝区明显不适

带否定词的正常词

头晕伴双侧下肢无力

带否定词的症状

实现:


if __name__ == "__main__":
    # Rule-assisted triage of NER entity strings (one per input line) into
    # three output files: normal findings, abnormal findings (symptoms), and
    # entities the rules cannot decide.
    #
    # Pass 1 files lines by regular expression. Pass 2 splits what is left at
    # the first "无" (negation) and asks is_symptom() / is_position() for a
    # verdict. Those two helpers are not defined in this section; the only
    # thing relied on here is that they return the string 'normal' or
    # 'abnormal' when they reach a decision. Any other return value leaves
    # the entity undecided.
    #
    # Lines keep their trailing newline throughout so they can be written
    # back unchanged with writelines().
    import re  # imported locally so this section carries its own dependency

    origin_file = "正式/ner_result.txt"
    normal_file = "正式/正常实体词.txt"
    abnormal_file = "正式/异常实体词.txt"
    unsure_file = "正式/不确定实体词.txt"

    normal_line = []
    abnormal_line = []
    unsure_line = []
    unsure1_line = []  # lines no regex rule could file; input to pass 2
    count = 0  # running index used only in the progress log

    # Patterns that mark a line as a normal (negated) finding.
    re_normal_list = [
        # contains both "无" and "明显" but not "诱因"
        r"^(?!.*诱因)(?=.*无)(?=.*明显).*$",
        # "无 ... 障碍"
        r".*无.*障碍.*",
    ]

    # Patterns where "无" is part of a word or phrase that is itself abnormal.
    re_abnormal_list = [
        r".*无法.*",
        r".*无力.*",
        r".*无名指.*",
        r".*无.*好转",
        # "无明显诱因" plus something else; the bare phrase alone is excluded
        r"^(?!无明显诱因$).*无明显诱因.*",
    ]

    # Compile once instead of handing pattern strings to re.match per line.
    re_flags = re.M | re.I
    abnormal_patterns = [re.compile(p, re_flags) for p in re_abnormal_list]
    normal_patterns = [re.compile(p, re_flags) for p in re_normal_list]

    # Pass 1: regex rules. Abnormal rules are tried first and win over the
    # normal rules. The input file is only needed for this pass.
    with open(origin_file, "r", encoding="utf-8") as f:
        for line in f:
            if any(p.match(line) for p in abnormal_patterns):
                abnormal_line.append(line)
            elif any(p.match(line) for p in normal_patterns):
                normal_line.append(line)
            else:
                unsure1_line.append(line)

    # Set mirror of normal_line so the duplicate check below is O(1) rather
    # than a list scan per line. It must be updated with every append.
    normal_seen = set(normal_line)

    # Pass 2: helper-based classification of the remaining lines.
    for line in unsure1_line:
        # An entity string already filed as normal is skipped entirely: it is
        # not re-evaluated and not written a second time.
        if line in normal_seen:
            continue

        index = line.find("无")
        if index > 0:
            # "无" in the middle: the text before it goes to is_position, the
            # text from "无" onward goes to is_symptom. Both are always
            # consulted, symptom check first.
            left = line[:index]
            right = line[index:]
            results = [is_symptom(right), is_position(left)]
            shown = left  # the log shows the left segment for split entities
        elif index == 0:
            # Line starts with "无": there is no left part to inspect.
            results = [is_symptom(line)]
            shown = line
        else:
            # No "无" at all: the whole line goes to is_position.
            results = [is_position(line)]
            shown = line

        # An 'abnormal' verdict from either helper outranks a 'normal' one.
        if 'abnormal' in results:
            verdict = 'abnormal'
            abnormal_line.append(line)
        elif 'normal' in results:
            verdict = 'normal'
            normal_line.append(line)
            normal_seen.add(line)
        else:
            verdict = None
            unsure_line.append(line)

        # Progress log. Every decided entity is logged and counted once.
        # Undecided single-helper entities log the raw helper output;
        # undecided split entities are not logged.
        if verdict is not None:
            print(count, shown.replace('\n', ''), verdict)
            count += 1
        elif index <= 0:
            print(count, shown.replace('\n', ''), results[0])
            count += 1

    with open(normal_file, "w", encoding="utf-8") as f:
        f.writelines(normal_line)

    with open(abnormal_file, "w", encoding="utf-8") as f:
        f.writelines(abnormal_line)

    with open(unsure_file, "w", encoding="utf-8") as f:
        f.writelines(unsure_line)

    # Same effect as exit(0) without relying on the site-provided helper.
    raise SystemExit(0)