【JS逆向系列】某方数据获取，proto入门

漁滒 · 发表于 2022-3-27 17:24

@TOC

样品网址：aHR0cHM6Ly93d3cud2FuZmFuZ2RhdGEuY29tLmNuL2luZGV4Lmh0bWw=

打开网站后，随便搜索一个关键词，这里以【百度】为例，本次需要分析的是【SearchService.SearchService/search】这个接口

这里可以看到，请求体不再是表单或者json，而是一堆类似乱码的东西，再看看响应体

也是一堆乱码，有的人可能就会想，会不会是有加密呢？当然不排除这个可能。接着再看看请求头，其中有一行是【content-type: application/grpc-web+proto】，这里指明了请求体的类型是proto。

所以这里的乱码并不是有加密，只是用proto这种格式序列化而已。那么接下来的流程就是编写proto文件，然后用protoc编译为对应语言可以调用的类文件。

首先下载protoc，到https://github.com/protocolbuffers/protobuf/下载最新的发行版工具，我这里下载的是win64版本

下载解压后，将里面的bin目录添加到环境变量，然后在cmd窗口输入【protoc --version】，出现版本号即为成功

因为我们分析的是【SearchService.SearchService/search】这个接口，所以先下一个XHR断点，刷新网页，在断点处断下，返回调用堆栈的上一层

看到一个类似组包的函数，那么在前面加一个断点，再次刷新

接着进入【r.a】

发现这是一个webpack打包的js，并且所有的信息序列化与反序列化的操作都在这个js里面，一般情况下，都是用的标准库的工具，所以首先直接搜索【.deserializeBinaryFromReader = 】，为什么搜索这个呢？这就好比json的数据会搜索【JSON.】是一样的。

这里就获取到每个信息是如何解析的，也就是可以获取信息的结构

如果一个一个来写的话，那就有点麻烦了，而且还怕会出错，那么为了保证准确性，这次使用ast来生成proto文件，首先把这个【app.1d44779a.js】下载到本地，并且执行下面代码


const parser = require("@babel/parser");
// 为parser提供模板引擎
const template = require("@babel/template").default;
// 遍历AST
const traverse = require("@babel/traverse").default;
// 操作节点，比如判断节点类型，生成新的节点等
const t = require("@babel/types");
// 将语法树转换为源代码
const generator = require("@babel/generator");
// 操作文件
const fs = require("fs");

//定义公共函数
/**
 * Write `code` to the file at `path`, opening it with the given flags.
 *
 * Delegates to fs.writeFileSync so the file descriptor is closed even when
 * the write throws, and so the complete payload is written instead of
 * whatever a single write call accepts.
 *
 * @param {string} path - Destination file path.
 * @param {string} flags - File system flag used to open the file, e.g. 'w' or 'a'.
 * @param {string|Buffer|Uint8Array} code - Content to write.
 * @returns {void}
 */
function wtofile(path, flags, code) {
    fs.writeFileSync(path, code, { flag: flags });
}

/**
 * Delete the file at `path`.
 *
 * Any error raised by fs.unlinkSync propagates to the caller unchanged.
 *
 * @param {string} path - File to remove.
 * @returns {void}
 */
function dtofile(path) {
    const target = path;
    fs.unlinkSync(target);
}

// Bundle downloaded from the site; it holds the protobuf (de)serializers to analyse.
const file_path = 'app.1d44779a.js';
// Load the whole bundle as UTF-8 text.
const jscode = fs.readFileSync(file_path, "utf-8");

// Parse the bundle into a Babel AST.
let ast = parser.parse(jscode);
// Header of the generated .proto file: the syntax line, then the protoc
// command kept as a comment, each followed by a blank line.
let proto_text = [
    'syntax = "proto2";',
    '',
    '// protoc --python_out=. app_proto2.proto',
    '',
    ''
].join('\n');

// Sentinel type name marking a field that is read through readEnum/readPackedEnum.
const ENUM_FIELD = 'readEnum';
// The deserializer only shows that a field is an enum, not its members, so
// every generated enum gets this many placeholder values (TYPE_0, TYPE_1, ...).
const ENUM_PLACEHOLDER_COUNT = 3;

// BinaryReader method name -> proto scalar type, and whether the reader itself
// implies a repeated field (the readPacked* family).
const reader_types = new Map([
    ['readString', { type: 'string', repeated: false }],
    ['readBytes', { type: 'bytes', repeated: false }],
    ['readEnum', { type: ENUM_FIELD, repeated: false }],
    ['readPackedEnum', { type: ENUM_FIELD, repeated: true }]
]);
[
    'Int32', 'Int64', 'Uint32', 'Uint64', 'Sint32', 'Sint64',
    'Fixed32', 'Fixed64', 'Sfixed32', 'Sfixed64', 'Float', 'Double', 'Bool'
].forEach(function (base) {
    const type = base.toLowerCase();
    // The "...String" variants return the number as a decimal string; the
    // wire type, and therefore the proto type, is the same.
    reader_types.set('read' + base, { type: type, repeated: false });
    reader_types.set('read' + base + 'String', { type: type, repeated: false });
    reader_types.set('readPacked' + base, { type: type, repeated: true });
    reader_types.set('readPacked' + base + 'String', { type: type, repeated: true });
});

// True when `node` is an expression statement wrapping a comma expression `a(), b()`.
function is_sequence_statement(node) {
    return Boolean(node) &&
        node.type === 'ExpressionStatement' &&
        node.expression.type === 'SequenceExpression';
}

// Method name of a statement shaped like `obj.method(...)`, or null for any other shape.
function member_call_name(statement) {
    if (!statement || statement.type !== 'ExpressionStatement') {
        return null;
    }
    const call = statement.expression;
    if (call.type !== 'CallExpression' || call.callee.type !== 'MemberExpression') {
        return null;
    }
    if (call.callee.property.type !== 'Identifier') {
        return null;
    }
    return call.callee.property.name;
}

// Resolve the proto type of a statement shaped like `tmp = new pkg.Msg` or
// `tmp = reader.readXxx()`. Returns { type, repeated }, or null when the
// statement has another shape or the reader method is not in reader_types.
function resolve_field_type(statement) {
    if (!statement || statement.type !== 'ExpressionStatement' ||
        statement.expression.type !== 'AssignmentExpression') {
        return null;
    }
    const source = statement.expression.right;
    if (source.type === 'NewExpression') {
        // Message field: drop the leading namespace identifier and join the rest with '_'.
        return {
            type: generator.default(source.callee).code.split('.').slice(1).join('_'),
            repeated: false
        };
    }
    if (source.type === 'CallExpression' && source.callee.type === 'MemberExpression') {
        return reader_types.get(source.callee.property.name) || null;
    }
    return null;
}

traverse(ast, {
    // Find every `<path>.deserializeBinaryFromReader = function ...` assignment and
    // turn the field switch inside that function into one proto2 message.
    MemberExpression(path){
        const assignment = path.parentPath;
        if(path.node.property.type !== 'Identifier' ||
            path.node.property.name !== 'deserializeBinaryFromReader' ||
            assignment.type !== 'AssignmentExpression'){
            return;
        }
        // Only the assignment target defines a deserializer; skip reads of the property.
        if(assignment.node.left !== path.node){
            return;
        }
        // `a.B.C.deserializeBinaryFromReader` -> `B_C`
        let id_name = path.toString().split('.').slice(1, -1).join('_');
        let switch_node = null;
        assignment.traverse({
            // Rewrite `var x = init` as the statement `x = init` so every case body
            // has the uniform shape [assignment, setter call, break].
            VariableDeclaration(path_2){
                if(path_2.node.declarations.length === 1){
                    path_2.replaceWith(t.expressionStatement(
                        t.assignmentExpression(
                            "=",
                            path_2.node.declarations[0].id,
                            path_2.node.declarations[0].init
                        )
                    ))
                }
            },
            SwitchStatement(path_2){
                // Remember the first switch visited; it is read back below to
                // generate the message fields.
                if(switch_node === null){
                    switch_node = path_2.node;
                }
                const cases = path_2.node.cases;
                // The last case is never used as `item` here; it is only ever the
                // look-ahead `item2` (presumably the `default:` branch -- TODO confirm).
                for (let i = 0; i < cases.length - 1; i++) {
                    let item = cases[i];
                    let item2 = cases[i + 1];
                    if(item.consequent.length === 0){
                        if(is_sequence_statement(item2.consequent[1])){
                            // Empty case followed by a case whose 2nd statement is `a, b`:
                            // the empty case receives `a`, the following case keeps `b`.
                            const sequence = item2.consequent[1].expression.expressions;
                            item.consequent = [
                                item2.consequent[0],
                                t.expressionStatement(sequence[0]),
                                item2.consequent[2]
                            ];
                            item2.consequent[1] = t.expressionStatement(sequence[1]);
                        }else{
                            // Plain fall-through: share the next case's body.
                            item.consequent = item2.consequent;
                        }
                    }else if(is_sequence_statement(item.consequent[1])){
                        // `first(...), second(...)`: keep only the second call.
                        item.consequent[1] = t.expressionStatement(
                            item.consequent[1].expression.expressions[1]
                        );
                    }
                }
            }
        });
        let id_text = 'message ' + id_name + ' {\n';
        const seen_numbers = new Set();
        if(switch_node === null){
            id_text += '\t// TODO: no field switch found in this deserializer\n';
        }
        const field_cases = switch_node === null ? [] : switch_node.cases;
        for (const item of field_cases) {
            // `default:` has no test expression.
            if(!item.test){
                continue;
            }
            let id_number = item.test.value;
            const setter = member_call_name(item.consequent[1]);
            if(setter === null){
                id_text += '\t// TODO: field ' + id_number + ' has an unrecognised case body\n';
                continue;
            }
            let id_st;
            if(setter.startsWith("set")){
                id_st = "optional";
            }else if(setter.startsWith("add")){
                id_st = "repeated";
            }else{
                // Map fields: not needed for this example, so they are skipped.
                continue;
            }
            // Field name = setter name without the 3-character set/add prefix.
            let key = setter.substring(3);
            const resolved = resolve_field_type(item.consequent[0]);
            if(resolved === null){
                // Previously such fields were written out as "[object Object]".
                id_text += '\t// TODO: unsupported reader for field ' + id_number + ' (' + key + ')\n';
                continue;
            }
            if(resolved.repeated){
                // NOTE(review): packed readers are emitted as plain `repeated`;
                // `[packed = true]` may be needed to match the wire encoding -- confirm.
                id_st = "repeated";
            }
            // Emit each field number only once.
            if(seen_numbers.has(id_number)){
                continue;
            }
            seen_numbers.add(id_number);
            let id_type = resolved.type;
            if(id_type === ENUM_FIELD){
                id_type = id_name + '_' + key + 'Enum';
                id_text += '\tenum ' + id_type + ' {\n';
                for (let j = 0; j < ENUM_PLACEHOLDER_COUNT; j++) {
                    id_text += '\t\t' + id_type + 'TYPE_' + j + ' = ' + j + ';\n';
                }
                id_text += '\t}\n\n';
            }
            id_text += '\t' + id_st + ' ' + id_type + ' ' + key + ' = ' + id_number + ';\n';
        }
        id_text += '}\n\n';
        proto_text += id_text
    }
});

// Write the accumulated definitions to the .proto file.
wtofile('app_proto2.proto', 'w', proto_text);

运行后可以得到一个【app_proto2.proto】的文件，打开后发现有少量报错

在网页中搜索这个信息结构

这里的o省略了路径名，所以无法获取到完整路径就报错了，手动补充一下即可，往上查找o的来源

o是来自于【e348】，那么搜索这个

一直拉到最下面看看导出的名称是什么

接着补全一下路径

其他的报错都可以如此类推解决，然后在当前目录打开cmd，输入指令编译出python可调用的类

protoc --python_out=. app_proto2.proto

此时就可以在当前目录看到一个【app_proto2_pb2.py】文件

尝试使用这个生成的类进行数据序列化，使用proto文件前，需要先安装依赖库

pip install protobuf

但是并不是直接序列化后就可以请求，这里可以看到对请求体还有一层包装，序列化的内容被设置到偏移5的位置，而偏移1的位置设置了【l】参数，这里的【l】参数就是后面数据的长度

那么尝试按照这个格式去生成一个请求体去试试能不能获取数据

import app_proto2_pb2
import requests_html
import struct

def main():
    """Send one framed SearchRequest and print the raw response.

    Builds a SearchService_SearchRequest for the keyword, prefixes the
    serialized bytes with a 5-byte header (one zero byte followed by the
    4-byte big-endian payload length), prints the framed request, POSTs it,
    and prints the response body and its length.
    """
    pb = app_proto2_pb2
    interface_enum = pb.SearchService_SearchRequest.SearchService_SearchRequest_InterfaceTypeEnum
    filter_enum = pb.SearchService_CommonRequest.SearchService_CommonRequest_SearchFilterListEnum

    session = requests_html.HTMLSession()

    message = pb.SearchService_SearchRequest()
    message.InterfaceType = interface_enum.Value('SearchService_SearchRequest_InterfaceTypeEnumTYPE_0')
    common = message.Commonrequest
    common.SearchType = 'paper'
    common.SearchWord = '百度'
    common.CurrentPage = 1
    common.PageSize = 20
    common.SearchFilterList.append(
        filter_enum.Value('SearchService_CommonRequest_SearchFilterListEnumTYPE_0')
    )

    payload = message.SerializeToString()
    # Header: one zero byte, then the payload length as a 4-byte big-endian int.
    framed = b"\x00" + struct.pack(">i", len(payload)) + payload
    print(framed)

    endpoint = 'https://s.wanfangdata.com.cn/SearchService.SearchService/search'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4901.0 Safari/537.36',
        'Content-Type': 'application/grpc-web+proto',
    }

    response = session.post(endpoint, headers=request_headers, data=framed)
    body = response.content
    print(body)
    print(len(body))

看起来返回的数据是正确了，那么接着尝试去反序列化数据

没有报错，非常好，说明编写的proto文件没有问题，本文结束，下面是完整代码


import app_proto2_pb2
import requests_html
import struct

def main():
    """Search for one keyword through the grpc-web endpoint and print the result.

    Builds a SearchService_SearchRequest, wraps the serialized bytes in a
    grpc-web frame (1 flag byte + 4-byte big-endian unsigned length + payload),
    POSTs it, then unwraps the first frame of the response and parses it as a
    SearchService_SearchResponse.

    Raises:
        requests.HTTPError: the server answered with an HTTP error status.
        ValueError: the response does not start with a complete data frame.
    """
    search_request = app_proto2_pb2.SearchService_SearchRequest()
    search_request.InterfaceType = app_proto2_pb2.SearchService_SearchRequest.SearchService_SearchRequest_InterfaceTypeEnum.Value('SearchService_SearchRequest_InterfaceTypeEnumTYPE_0')
    search_request.Commonrequest.SearchType = 'paper'
    search_request.Commonrequest.SearchWord = '百度'
    search_request.Commonrequest.CurrentPage = 1
    search_request.Commonrequest.PageSize = 20
    search_request.Commonrequest.SearchFilterList.append(app_proto2_pb2.SearchService_CommonRequest.SearchService_CommonRequest_SearchFilterListEnum.Value('SearchService_CommonRequest_SearchFilterListEnumTYPE_0'))
    payload = search_request.SerializeToString()
    # Frame header: flag byte 0, then the length as an UNSIGNED 32-bit
    # big-endian integer (">i" would be signed).
    data = struct.pack(">BI", 0, len(payload)) + payload
    print(data)

    url = 'https://s.wanfangdata.com.cn/SearchService.SearchService/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4901.0 Safari/537.36',
        'Content-Type': 'application/grpc-web+proto',
    }

    # The context manager closes the session's connections when done.
    with requests_html.HTMLSession() as session:
        response = session.post(url, headers=headers, data=data)
        # Fail on HTTP errors instead of trying to parse an error page as protobuf.
        response.raise_for_status()
        content = response.content

    if len(content) < 5:
        raise ValueError('response is shorter than the 5-byte frame header')
    flag, data_len = struct.unpack(">BI", content[:5])
    # In grpc-web the high bit of the flag byte marks a trailers frame, which
    # carries status metadata rather than a protobuf message.
    if flag & 0x80:
        raise ValueError('response starts with a trailers frame, not a message')
    if len(content) < 5 + data_len:
        raise ValueError('response frame is truncated')

    search_response = app_proto2_pb2.SearchService_SearchResponse()
    search_response.ParseFromString(content[5: 5 + data_len])
    print(search_response)

if __name__ == '__main__':
    main()

附加内容：对于较少的信息结构，直接手动写也很快。但是多的时候，手动写重复的工作多，还很容易出错，ast的作用就体现出来了。对于web端，proto文件的自动还原可以使用ast，而在app的话，那该如何解决呢？可以参考下面文章使用frida解决

https://github.com/SeeFlowerX/frida-protobuf

unmask · 发表于 2022-3-27 20:27

一般情况下，都是用的标准库的工具，所以首先直接搜索【.deserializeBinaryFromReader = 】，为什么搜索这个呢？这就好比json的数据会搜索【JSON.】是一样的。

用的标准库工具就是搜索【.deserializeBinaryFromReader = 】，这个还是很勉强，用的哪个标准库？每个标准库都是这个funcName？这个不一定吧。
根据我自己的经验，顶多会搜索【deserialize/unserialize】,可能会瞎猫碰死耗子刚好找到了这个deserializeBinaryFromReader，所以这中间还是缺很多东西的，像我这种小白还是看不明白。
但是大佬既然是科普，希望能再补一补这种缺失的东西。

oyfj · 发表于 2022-4-8 17:18

跟着楼主的干货研究了好几天，总算搞懂了。因为没学过python，也看不懂struct.pack和unpack.... 所以写了个nodejs版的...

Node.js版代码如下：

const proto = require('./app_proto2_pb')
 
var request = require('request');
 
/**
 * Wrap a serialized protobuf message in a grpc-web data frame.
 *
 * Frame layout: byte 0 is the flag byte (left at 0), bytes 1-4 hold the
 * payload length as a big-endian unsigned 32-bit integer, and the payload
 * starts at byte 5. Writing all four length bytes keeps the frame valid for
 * payloads longer than 255 bytes.
 *
 * @param {Uint8Array} payload - Serialized protobuf message.
 * @returns {Buffer} The framed request body.
 */
function frameGrpcWebMessage(payload) {
    // Buffer.alloc zero-fills, so the flag byte is already 0.
    const frame = Buffer.alloc(5 + payload.length);
    frame.writeUInt32BE(payload.length, 1);
    frame.set(payload, 5);
    return frame;
}

/**
 * Extract the protobuf message from the first frame of a response body.
 *
 * @param {Buffer} body - Raw response body.
 * @returns {Buffer|null} The message bytes, or null when the body does not
 *     contain a complete frame.
 */
function readGrpcWebMessage(body) {
    if (body.length < 5) {
        return null;
    }
    // Skip the flag byte; the length is the 4 bytes that follow it.
    const length = body.readUInt32BE(1);
    console.log('data length', length);
    if (body.length < 5 + length) {
        return null;
    }
    return body.subarray(5, 5 + length);
}

const searchreq = new proto.SearchService_SearchRequest()
const commonreq = new proto.SearchService_CommonRequest();
// Optional enum field, left unset here:
// searchreq.setInterfacetype (proto.SearchService_SearchRequest.SearchService_SearchRequest_InterfaceTypeEnum.SEARCHSERVICE_SEARCHREQUEST_INTERFACETYPEENUMTYPE_0)
commonreq.setSearchtype('paper')
commonreq.setSearchword('百度')
commonreq.setCurrentpage(1)
commonreq.setPagesize(20)
// Optional repeated enum field, left unset here:
// commonreq.setSearchfilterlistList ( [proto.SearchService_CommonRequest.SearchService_CommonRequest_SearchFilterListEnum.SearchService_CommonRequest_SearchFilterListEnumTYPE_0])

searchreq.setCommonrequest(commonreq)
const data = searchreq.serializeBinary()
const a = frameGrpcWebMessage(data)
// console.log('body',data)
// console.log('wrap',a)
request.post(
    {
        url: 'https://s.wanfangdata.com.cn/SearchService.SearchService/search',
        // encoding: null makes `request` hand back a Buffer; otherwise the
        // body is decoded to a string, which corrupts the binary data.
        encoding: null,
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4901.0 Safari/537.36',
            'Content-Type': 'application/grpc-web+proto',
        },
        body: a

    }, function (err, httpResponse, body) {
        // On a transport failure httpResponse and body are not set.
        if (err) {
            console.error('请求出错', err)
            return
        }
        if (httpResponse.statusCode !== 200) {
            console.error('请求出错', body.toString())
            return
        }
        // body is a Buffer (see `encoding: null` above).
        console.log('body size', body.length);
        const result = readGrpcWebMessage(body)
        if (result === null) {
            console.error('请求出错', 'incomplete grpc-web frame')
            return
        }
        const obj = proto.SearchService_SearchResponse.deserializeBinary(result)
        console.log(obj.toObject());
    })

大概讲一下组包和解包的思路：
组包：前5位是帧头，其中第1位是标志位（这里为0），第2到5位是request body被proto序列化后得到的uint8array的长度（4字节大端无符号整数），第6位开始就是序列化后的uint8array了。
解包：跳过第1位的标志位，把第2到5位按大端无符号整数读出就得到了长度（标志位为0时，等价于截取前5位转成16进制后再转10进制），然后就可以截取真实的包了。（Buffer里是10进制的）
那个ast生成proto的代码如果是我来写我估计要调试好几天才写得出来。。膜拜大神

逗逗苍穹 · 发表于 2022-3-27 17:30

大佬，学习了，学习了。

13248101888 · 发表于 2022-3-27 17:33

我看限制不了

kdkdkdkd · 发表于 2022-3-27 18:37

感谢大佬，互相学习互相进步

ciker_li · 发表于 2022-3-27 19:51

好难啊，学习学习

muyejianghu · 发表于 2022-3-27 20:14

哇偶，好复杂的样子

tukuai88ya · 发表于 2022-3-27 20:39

好难啊，学习学习

漁滒 · 发表于 2022-3-27 20:39

unmask 发表于 2022-3-27 20:27
用的标准库工具就是搜索【.deserializeBinaryFromReader = 】，这个还是很勉强，用的哪个标准库？每个 ...

目前我遇到的格式基本是两种，这个是其中一种。还有一种类似于某音的，是路径后面【.decode】，所以基本搜索这两种就可以了，也算是一种经验吧

yxpp · 发表于 2022-3-27 21:21

看大佬知识，长见识

帐号		自动登录	找回密码
密码			注册[Register]

[Web逆向] 【JS逆向系列】某方数据获取，proto入门

免费评分

免费评分

免费评分