[2021]余弦定理检测文件相似度 & 病毒样本基因检测

本余弦定理有如下应用场景:
1.相似度计算
2.信息推送

在网络安全领域,主要就是样本基因检测,或者叫做样本相似度计算,它的公式长这样:
![](https://key08.com/usr/uploads/2021/08/2243488000.png)
请注意,之所以叫做余弦定理,是因为它求的就是三角形中一个角(即两个向量夹角)的余弦值,并且这个定理在N维空间中同样成立
![](https://key08.com/usr/uploads/2021/08/3451125394.png)

样本相似度检测,以两个风灵月影为例,属于同一个家族:
![](https://key08.com/usr/uploads/2021/08/947620823.png)

# 编码
通过pefile库读入两个文件,分别统计各自代码段中每个字节值出现的次数,得到两个频率向量A和B,再计算这两个向量夹角的余弦值:
```python
def get_peinfo_by_cos(pSource, pTarget):
    """Return the cosine similarity of the code-byte histograms of two PE files.

    Each file is loaded with ``pefile.PE`` and ``get_pe_info`` supplies its
    memory-mapped image together with ``SizeOfCode`` and ``BaseOfCode``.
    The bytes in ``image[BaseOfCode:BaseOfCode + SizeOfCode]`` are counted
    per byte value, and the two count vectors are compared with
    ``cos = (A . B) / (|A| * |B|)``.

    Args:
        pSource: Path of the first PE file, handed to ``pefile.PE``.
        pTarget: Path of the second PE file, handed to ``pefile.PE``.

    Returns:
        float: The cosine of the angle between the two byte-count vectors.
        Counts are non-negative, so the value lies in ``[0.0, 1.0]``.
        ``0.0`` is returned when either file yields no bytes in its code
        range, because the cosine is undefined for a zero vector.
    """
    # Local import keeps this function self-contained; Counter is stdlib.
    from collections import Counter

    source = pefile.PE(pSource)
    target = pefile.PE(pTarget)
    source_map, source_sizeof_code, source_base_of_code = get_pe_info(source)
    target_map, target_sizeof_code, target_base_of_code = get_pe_info(target)

    # Slicing clamps at the end of the image, so a header that claims more
    # code than is actually mapped cannot add an empty pseudo-byte entry.
    # Counter also starts every count at 1 on first sight (not 0).
    source_counts = Counter(
        source_map[source_base_of_code:source_base_of_code + source_sizeof_code]
    )
    target_counts = Counter(
        target_map[target_base_of_code:target_base_of_code + target_sizeof_code]
    )

    # Euclidean norm of each count vector.
    source_norm = math.sqrt(sum(n * n for n in source_counts.values()))
    target_norm = math.sqrt(sum(n * n for n in target_counts.values()))
    if source_norm == 0 or target_norm == 0:
        return 0.0

    # Dot product aligned by byte value: a byte missing from one file
    # contributes 0, so only byte values present in both files matter.
    shared_bytes = source_counts.keys() & target_counts.keys()
    vector_multi = sum(
        source_counts[value] * target_counts[value] for value in shared_bytes
    )

    # Cosine of the angle between the two vectors.
    return float(vector_multi) / (source_norm * target_norm)

```
测试样本共三组(每组两个文件):一组是相似的,一组是不相似的,一组属于同一恶意样本家族
来试试:
![](https://key08.com/usr/uploads/2021/08/1402924204.png)
简单粗暴,并且有效.
完整代码:

```python

import math
import os
import string
import hashlib
import pefile
import datetime
from functools import reduce

def get_pe_info(pPe):
    """Collect the values needed to locate the code bytes of a parsed PE.

    Args:
        pPe: Object exposing ``get_memory_mapped_image()`` and an
            ``OPTIONAL_HEADER`` with ``SizeOfCode`` and ``BaseOfCode``
            (the caller in this file passes a ``pefile.PE`` instance).

    Returns:
        tuple: ``(memory_mapped_image, SizeOfCode, BaseOfCode)``.
    """
    mapped_image = pPe.get_memory_mapped_image()
    optional_header = pPe.OPTIONAL_HEADER
    return (mapped_image, optional_header.SizeOfCode, optional_header.BaseOfCode)
    
def get_peinfo_by_cos(pSource, pTarget):
    """Return the cosine similarity of the code-byte histograms of two PE files.

    Each file is loaded with ``pefile.PE`` and ``get_pe_info`` supplies its
    memory-mapped image together with ``SizeOfCode`` and ``BaseOfCode``.
    The bytes in ``image[BaseOfCode:BaseOfCode + SizeOfCode]`` are counted
    per byte value, and the two count vectors are compared with
    ``cos = (A . B) / (|A| * |B|)``.

    Args:
        pSource: Path of the first PE file, handed to ``pefile.PE``.
        pTarget: Path of the second PE file, handed to ``pefile.PE``.

    Returns:
        float: The cosine of the angle between the two byte-count vectors.
        Counts are non-negative, so the value lies in ``[0.0, 1.0]``.
        ``0.0`` is returned when either file yields no bytes in its code
        range, because the cosine is undefined for a zero vector.
    """
    # Local import keeps this function self-contained; Counter is stdlib.
    from collections import Counter

    source = pefile.PE(pSource)
    target = pefile.PE(pTarget)
    source_map, source_sizeof_code, source_base_of_code = get_pe_info(source)
    target_map, target_sizeof_code, target_base_of_code = get_pe_info(target)

    # Slicing clamps at the end of the image, so a header that claims more
    # code than is actually mapped cannot add an empty pseudo-byte entry.
    # Counter also starts every count at 1 on first sight (not 0).
    source_counts = Counter(
        source_map[source_base_of_code:source_base_of_code + source_sizeof_code]
    )
    target_counts = Counter(
        target_map[target_base_of_code:target_base_of_code + target_sizeof_code]
    )

    # Euclidean norm of each count vector (square root of the sum of squares).
    source_norm = math.sqrt(sum(n * n for n in source_counts.values()))
    target_norm = math.sqrt(sum(n * n for n in target_counts.values()))
    if source_norm == 0 or target_norm == 0:
        return 0.0

    # Dot product aligned by byte value: a byte missing from one file
    # contributes 0, so only byte values present in both files matter.
    shared_bytes = source_counts.keys() & target_counts.keys()
    vector_multi = sum(
        source_counts[value] * target_counts[value] for value in shared_bytes
    )

    # Cosine of the angle between the two vectors.
    return float(vector_multi) / (source_norm * target_norm)

if __name__ == "__main__":
    # Sample pairs to compare; each entry holds the two file paths under
    # the keys "a1" and "a2".
    input_duck = [
        {
            "a1": "C:\\Users\\Administrator\\Desktop\\Mount and Blade II Bannerlord Early Access Plus 33 Trainer Updated 2021.06.12.exe",
            "a2": "C:\\Users\\Administrator\\Desktop\\DOOM v1.0 Plus 13 Trainer.exe",
        },
        {
            "a1": "C:\\Users\\Administrator\\Desktop\\Hasher.exe",
            "a2": "C:\\Users\\Administrator\\Desktop\\pafish.exe",
        },
        {
            "a1": "Z:\\1.bin",
            "a2": "Z:\\2.bin",
        },
    ]
    # Print the similarity score of every pair, in list order.
    for pair in input_duck:
        similarity = get_peinfo_by_cos(pair['a1'], pair['a2'])
        print("两个文件相似度为: {}".format(similarity))

```
### 不足与改进
1. 没有考虑到壳的问题
2. 没有考虑到其他的因素,比如import table等;这些特征没有被纳入向量,属于信息丢失的问题
3. 没有考虑到字节码与地址的区别,应该只比较 字节码而要忽略地址[事实上我故意忽略了这一步]

白帽Wiki

一只鸭子

白帽Wiki - 一个简单的wiki