【第21天】训练模型-模型组合与辨识isnull(二)

摘要

  1. 作业流程
  2. 设定资料集路径
  3. 找出每个中文字的阈值
  4. 任意选择奇数个模型组合後,产生模型权重表与利用新模型权重得到的机率表。
  5. 判断isnull

内容

  1. 作业流程(今日进度请参阅红框处)

  2. 设定资料集路径

    2.1 我们有7个模型,每个模型输出3个机率表(官方800字内、官方800字外、测试赛),共21个。

    2.2 机率表中有803个栏位,分别是1~800字机率、预测值、实际值及是否正确预测。

    2.3 程序码

    # Probability tables for the official data set, characters inside the
    # 800-character vocabulary: one CSV per model.
    offical_in800_1 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/densenet201_v2_2/official_in_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_in800_2 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/resnet152V2_v1_2/official_in_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_in800_3 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/xception_v2_2/official_in_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_in800_ex3 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/inceptionResNetV2_v1_2/official_in_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    # The ex4 and ex6 files are read with the default encoding.
    offical_in800_ex4 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/densenet201_in800_official_韦智.csv"
    )
    offical_in800_ex5 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/prob炫斐/official_in_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_in800_ex6 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/swa_v2/swa_v2_in800_official_韦智.csv"
    )

    # Give columns 801 and 802 of every in-800 table the same names:
    # the predicted character and the actual character.
    names(offical_in800_1)[801:802] <- c("predict_word", "origin_word")
    names(offical_in800_2)[801:802] <- c("predict_word", "origin_word")
    names(offical_in800_3)[801:802] <- c("predict_word", "origin_word")
    names(offical_in800_ex3)[801:802] <- c("predict_word", "origin_word")
    names(offical_in800_ex4)[801:802] <- c("predict_word", "origin_word")
    names(offical_in800_ex5)[801:802] <- c("predict_word", "origin_word")
    names(offical_in800_ex6)[801:802] <- c("predict_word", "origin_word")

    # Probability tables for the official data set, characters outside the
    # 800-character vocabulary: one CSV per model.
    offical_noin800_1 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/densenet201_v2_2/official_notin_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_noin800_2 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/resnet152V2_v1_2/official_notin_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_noin800_3 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/xception_v2_2/official_notin_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_noin800_ex3 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/inceptionResNetV2_v1_2/official_notin_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_noin800_ex4 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/densenet201_notin800_official_韦智.csv"
    )
    offical_noin800_ex5 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/prob炫斐/official_notin_800.csv",
      fileEncoding = "UTF-8-BOM"
    )
    offical_noin800_ex6 <- read.csv(
      file = "C:/Users/wooden/Desktop/dl/probCSV/swa_v2/swa_v2_notin800_official_韦智.csv"
    )
    
    # Test-competition tables.
    # NOTE(review): every statement below reads the same official_notin_800
    # files into the same offical_noin800_* variables as the "outside the 800
    # characters" section just above, so no test-competition table is actually
    # loaded here. get_new_model() has a dataset == 'test_data' branch, which
    # suggests test_data_* variables were intended -- TODO confirm the correct
    # file paths and variable names before relying on this section.
    offical_noin800_1 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/densenet201_v2_2/official_notin_800.csv",fileEncoding="UTF-8-BOM")
    offical_noin800_2 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/resnet152V2_v1_2/official_notin_800.csv",   fileEncoding="UTF-8-BOM")
    offical_noin800_3 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/xception_v2_2/official_notin_800.csv",fileEncoding="UTF-8-BOM")
    offical_noin800_ex3 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/inceptionResNetV2_v1_2/official_notin_800.csv",fileEncoding="UTF-8-BOM")
    offical_noin800_ex4 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/densenet201_notin800_official_韦智.csv")
    offical_noin800_ex5 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/prob炫斐/official_notin_800.csv",fileEncoding="UTF-8-BOM")
    offical_noin800_ex6 = read.csv(file = "C:/Users/wooden/Desktop/dl/probCSV/swa_v2/swa_v2_notin800_official_韦智.csv")
    
  3. 找出每个中文字的阈值

    3.1 定义function:找出阈值最小值&平均机率

    # Per-character statistics used to define thresholds: sample count,
    # accuracy, and the minimum / mean probability over correct predictions.
    #
    # Args:
    #   data_prob: data frame whose first 800 columns are named after the
    #     characters and hold their probabilities, and which also has the
    #     columns `predict_word` and `origin_word`.
    #
    # Returns:
    #   A data frame with one row per distinct name among the first 800
    #   columns and the columns word, acc, min_prob, n, mean_prob.
    #   min_prob and mean_prob are 0 for a character that was never predicted
    #   correctly; acc is NaN for a character with no rows (0 / 0).
    get_acc_min <- function(data_prob) {
      word <- unique(names(data_prob)[1:800])

      # Statistics for one character, computed from the rows whose actual
      # character is `w`.
      summarise_word <- function(w) {
        rows <- data_prob[data_prob$origin_word == w, ]
        hit <- rows$predict_word == rows$origin_word
        n_rows <- nrow(rows)
        accuracy <- round(sum(hit) / n_rows, 4)
        if (any(hit)) {
          # Probability assigned to `w` on the correctly predicted rows only.
          hit_prob <- as.numeric(rows[hit, which(names(rows) == w)])
          lowest <- min(hit_prob)
          average <- mean(hit_prob)
        } else {
          lowest <- 0
          average <- 0
        }
        list(n = n_rows, acc = accuracy, min_prob = lowest, mean_prob = average)
      }

      stats <- lapply(word, summarise_word)
      pull_num <- function(field) {
        vapply(stats, function(s) s[[field]], numeric(1))
      }

      data.frame(
        word = word,
        acc = pull_num("acc"),
        min_prob = pull_num("min_prob"),
        n = vapply(stats, function(s) s[["n"]], integer(1)),
        mean_prob = pull_num("mean_prob")
      )
    }
    

    3.2 找出800个字的阈值,汇整後储存CSV档案

    # Per-character accuracy, minimum probability and mean probability for
    # each of the seven models.
    offical_in800_1_summary <- get_acc_min(offical_in800_1)
    offical_in800_2_summary <- get_acc_min(offical_in800_2)
    offical_in800_3_summary <- get_acc_min(offical_in800_3)
    offical_in800_ex3_summary <- get_acc_min(offical_in800_ex3)
    offical_in800_ex4_summary <- get_acc_min(offical_in800_ex4)
    offical_in800_ex5_summary <- get_acc_min(offical_in800_ex5)
    offical_in800_ex6_summary <- get_acc_min(offical_in800_ex6)

    # Assemble one wide table: the character label and its sample count (both
    # taken from model 1), then the seven acc_* columns, the seven min_prob_*
    # columns and the seven mean_prob_* columns. local() keeps the helper
    # variables out of the workspace.
    final <- local({
      model_ids <- c("1", "2", "3", "ex3", "ex4", "ex5", "ex6")
      model_summaries <- list(
        offical_in800_1_summary,
        offical_in800_2_summary,
        offical_in800_3_summary,
        offical_in800_ex3_summary,
        offical_in800_ex4_summary,
        offical_in800_ex5_summary,
        offical_in800_ex6_summary
      )
      combined <- data.frame(
        word = offical_in800_1_summary$word,
        n = offical_in800_1_summary$n
      )
      for (stat_name in c("acc", "min_prob", "mean_prob")) {
        for (k in seq_along(model_ids)) {
          target <- paste0(stat_name, "_", model_ids[k])
          combined[[target]] <- model_summaries[[k]][[stat_name]]
        }
      }
      combined
    })

    # Save: character label + sample count n + accuracy, minimum probability
    # and mean probability for each of the 7 models.
    write.csv(
      final,
      file = "C:/Users/wooden/Desktop/dl/model/model_weight_V3.csv",
      row.names = FALSE
    )
    

    3.3 输出结果(以CSV档显示)

    • 栏位:中文字标签、该字出现n次、7个模型ACC、7个模型min_prob、7个模型mean_prob。
    • 表格内容
  4. 任意选择奇数个模型组合後,产生组合权重表,并利用模型权重得到新的机率表。

    4.1 定义function:列出n个模型所有的0/1组合(每一列代表一种组合),供後续挑选奇数个模型的组合。

    # Enumerate every on/off combination of n models.
    #
    # Args:
    #   n: number of models (non-negative whole number).
    #
    # Returns:
    #   A numeric 2^n x n matrix of 0/1 values. Row k holds the binary digits
    #   of k - 1, most significant digit first, so every combination appears
    #   exactly once (including the all-zero row). The function does not
    #   restrict itself to odd-sized combinations; presumably the caller keeps
    #   the rows whose rowSums() is odd -- TODO confirm against the caller.
    BitMatrix <- function(n) {
      codes <- 0:(2^n - 1)
      rst <- matrix(0, ncol = n, nrow = 2^n)
      # seq_len(n) is empty for n = 0, whereas 1:n would iterate over c(1, 0)
      # and fail with a subscript error on the zero-column matrix.
      for (i in seq_len(n)) {
        # Column i is the digit with place value 2^(n - i).
        rst[, i] <- (codes %/% 2^(n - i)) %% 2
      }
      rst
    }
    

    4.2 定义function:以官方800字内资料集机率表,组合模型後产出权重表。并利用模型权重得到新的机率表。

    # Combine several models into one weighted probability table.
    #
    # Reads the data frame `final` (per-character acc_*, min_prob_* and
    # mean_prob_* columns plus `word`) and the probability tables named
    # "<dataset>_<model id>" from the enclosing environment; `final` itself is
    # only modified as a local copy.
    #
    # Args:
    #   namesmodel: model ids to combine, e.g. c(1, 2, "ex3"); must not be
    #     empty.
    #   stat: 'acc' weights each model by its per-character accuracy; any other
    #     value weights by its per-character mean probability.
    #   dataset: prefix of the probability tables to combine.
    #
    # Returns:
    #   An unnamed list of two data frames:
    #   [[1]] the weighted sum of the first 800 columns of every table. Unless
    #         dataset is "offical_noin800", a column `acc` is added (1 when the
    #         character with the highest combined value equals `origin_word`
    #         of the last model's table, otherwise 0). For dataset 'test_data'
    #         the column `origin_word` is added as well.
    #   [[2]] `final` extended with one wei_<id> column per model and with
    #         min_prob_new / mean_prob_new, the weighted sums of the models'
    #         min_prob_* / mean_prob_* columns.
    get_new_model <- function(namesmodel = c(1), stat = 'acc', dataset = "offical_in800") {
      stopifnot(length(namesmodel) > 0)

      stat_prefix <- if (stat == 'acc') "acc_" else "mean_prob_"

      # Denominator of every weight: the per-character sum of the chosen
      # statistic over all selected models. It does not change inside the
      # loop, so it is computed once.
      stat_total <- Reduce(
        `+`,
        lapply(paste0(stat_prefix, namesmodel), function(col) final[[col]])
      )

      for (i in seq_along(namesmodel)) {
        model_id <- namesmodel[i]
        # Look the table up by name instead of building code with
        # eval(parse()).
        model_prob <- get(paste0(dataset, "_", model_id))

        wei_col <- paste0("wei_", model_id)
        final[[wei_col]] <- final[[paste0(stat_prefix, model_id)]] / stat_total

        # One row of weights per sample, so the table can be multiplied
        # element-wise.
        wei_matrix <- t(matrix(final[[wei_col]], ncol = nrow(model_prob), nrow = 800))
        weighted <- model_prob[, 1:800] * wei_matrix
        result <- if (i == 1) weighted else result + weighted
      }

      # The label columns are taken from the last selected model's table.
      last_prob <- model_prob

      if (dataset != "offical_noin800") {
        maxindex <- apply(result, 1, which.max)
        result$acc <- ifelse(last_prob$origin_word == final$word[maxindex], 1, 0)
      }
      if (dataset == 'test_data') {
        result$origin_word <- last_prob$origin_word
      }

      new_stat <- final
      weighted_sum <- function(prefix) {
        Reduce(`+`, lapply(namesmodel, function(id) {
          new_stat[[paste0(prefix, id)]] * new_stat[[paste0("wei_", id)]]
        }))
      }
      new_stat$min_prob_new <- weighted_sum("min_prob_")
      new_stat$mean_prob_new <- weighted_sum("mean_prob_")

      list(result, new_stat)
    }
    

    4.3 输出结果(以CSV档显示)

    • 模型组合权重表(红框处为组合权重)

    • 新机率表(红框处代表是否正确预测,正确预测为1;错误预测为0)

  5. 判断isnull

    5.1 定义function

    # Decide, per sample, whether the prediction should be treated as isnull.
    #
    # A sample is flagged 1 when its highest probability is below the threshold
    # of the character holding that probability, otherwise 0.
    #
    # Two modes:
    #   * new_data and new_stat both NULL: every model in `namesmodel` votes
    #     using its own table "<dataset>_<id>" and its own threshold column of
    #     `final` (both read from the enclosing environment). The result is 1
    #     when at least half of the models vote 1, so a tie between an even
    #     number of models counts as 1.
    #   * otherwise: the combined table `new_data` is checked against the
    #     combined threshold column of `new_stat`.
    #
    # Args:
    #   namesmodel: model ids that vote (first mode only); must not be empty.
    #   stat: 'min_prob' uses the min_prob_* thresholds; any other value uses
    #     the mean_prob_* thresholds.
    #   dataset: prefix of the per-model probability tables (first mode only).
    #   new_data: combined probability table whose first 800 columns are the
    #     character probabilities, or NULL.
    #   new_stat: data frame with min_prob_new / mean_prob_new, or NULL.
    #
    # Returns:
    #   A vector with one 0/1 value per row of the probability table.
    get_min01 <- function(namesmodel = c(1), stat = 'min_prob', dataset = "offical_in800", new_data = NULL, new_stat = NULL) {
      stat_key <- if (stat == 'min_prob') "min_prob" else "mean_prob"

      # 1 when the row's highest probability is below the threshold of the
      # character holding it, otherwise 0.
      flag_rows <- function(prob_tbl, threshold) {
        apply(prob_tbl, 1, FUN = function(x) {
          maxindex <- which.max(x)
          ifelse(x[maxindex] >= threshold[maxindex], 0, 1)
        })
      }

      # Scalar && is the correct operator for an if() condition.
      if (is.null(new_data) && is.null(new_stat)) {
        stopifnot(length(namesmodel) > 0)
        # Tables and threshold columns are looked up by name instead of being
        # assembled into code with eval(parse()).
        votes <- lapply(namesmodel, function(id) {
          flag_rows(
            get(paste0(dataset, "_", id))[, 1:800],
            final[[paste0(stat_key, "_", id)]]
          )
        })
        share <- Reduce(`+`, votes) / length(namesmodel)
        result <- ifelse(share >= 0.5, 1, 0)
      } else {
        result <- flag_rows(
          new_data[, 1:800],
          new_stat[[paste0(stat_key, "_new")]]
        )
      }
      result
    }
    

小结

  1. 今天成功取得阈值、奇数的模型组合的权重表,并定义如何判断isnull的function。
  2. 下一章的目标是:「交叉比对不同的模型组合方法,并选出其中最佳的」。

让我们继续看下去...


<<:  RISC-V: R-type 算术指令

>>:  Day 21- To Do List (8) 利用 HTML Template 呈现资料

菜鸡用 Phaser 拾起童年游戏 29

今天要来感谢很多事情,把所有的感谢都奉上,以及我先前做的程序码,提供给大家参考。 Photo by ...

Day03 - 随意玩之 API 讯息内文以及 Sign

今天预计讲解下面两个 (也就是下图的步骤 5) API 的 JSON 内容 把内容加上 Nonce ...

【从实作学习ASP.NET Core】Day30 | 总结与回顾

完结洒花 当初只是想藉这个机会督促自己学新东西,还真的没想到能够完赛 xD 这是我第一次写技术文章 ...

D08 / 怎麽做自己的 Modifier.padding? - Custom Layout Modifier

今天大概会聊到的范围 layout modifier 上一次讨论到 Modifier 时,觉得自己...

Day 4 - Array

当需要把资料放在一起时,就会需要 Array (阵列)。 小提醒:阵列不是原始资料型别之一。 当有很...