吾爱破解 - LCG - LSG |安卓破解|病毒分析|www.52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 652|回复: 4
收起左侧

[C&C++ 转载] 判断文件编码(UTF8,8BOM,16LE,16BE,ANSI)附string互转wstring、String互转Wchar_...

  [复制链接]
2370177068 发表于 2022-12-4 00:34
本帖最后由 2370177068 于 2023-12-24 18:41 编辑

至于算不算原创,我自己也说不清:很多代码是复制粘贴(Ctrl+C / Ctrl+V)后由我拼凑起来的,能用就行。
前面几个函数做字符串转换;split 把 std::string 按分隔符切分成 vector;get_last_error 获取错误码对应的错误文本。

代码写得比较屎,你们用之前写个控制台测试一下吧


[C++] 纯文本查看 复制代码
#pragma once
#ifndef MYTOOL_H
#define MYTOOL_H

#include <stdio.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <atlconv.h>

// Collection of static Windows text helpers: narrow/wide string conversion,
// string splitting, Win32 error-message lookup and file-encoding detection.
// Every member function is static and the class declares no data members.
class MYTOOL {
public:
    MYTOOL() = default;
    ~MYTOOL() = default;
    // Converts a narrow string to a wide string using the CP_ACP code page.
    static std::wstring string2wstring(std::string str);
    // Converts a wide string to a narrow string using the CP_ACP code page.
    static std::string wstring2string(std::wstring wstr);
    // Splits `str` at each occurrence of `pattern` and returns the pieces in order.
    static std::vector<std::string> split(std::string str, std::string pattern);
    // Converts the NUL-terminated wide string `wchar` to a narrow string stored in `szDst`.
    static void Wchar_tToString(std::string& szDst, wchar_t* wchar);
    // Returns a wide copy of `str` in a buffer allocated with new[]; nothing in
    // this class frees it, so the caller presumably has to delete[] it.
    static wchar_t* StringToWchar_t(std::string& str);
    // Returns the system message text for `errCode`. The default argument calls
    // GetLastError() at every call site that omits `errCode`.
    static std::string get_last_error(DWORD errCode = GetLastError());
    // Returns 1 when the first `size` bytes of `pBuffer` look like UTF-8, 0 otherwise.
    static int IsUTF8(const void* pBuffer, long size);
    // Reads the file at `filePath` into `str` and returns an encoding code:
    // 0 read failure, 1 UTF-8, 2 UTF-16LE, 3 UTF-16BE, 4 UTF-8 with BOM, 5 none of these.
    static int CalculateFileEncoding(LPCSTR filePath, std::string& str);
    // Wide-path variant of CalculateFileEncoding.
    static int CalculateFileEncodingW(LPCWSTR filePath, std::string& str);
    // Same detection as CalculateFileEncoding, but returns the encoding name as text.
    static std::string 判断文件编码(LPCSTR filePath, std::string& 读到的文本);
    // Wide-path variant of 判断文件编码.
    static std::string 判断文件编码W(LPCWSTR filePath, std::string& 读到的文本);
    // Opens `file` in "r,ccs=UTF-8" text mode and returns its contents as a narrow string.
    static std::string 读取文件UTF8(std::string file);
    // Wide-path variant of 读取文件UTF8.
    static std::string 读取文件UTF8W(const wchar_t* file);
private:

};

std::string MYTOOL::读取文件UTF8(std::string file) {
    FILE* fp;
    auto err = _wfopen_s(&fp, string2wstring(file).c_str(), L"r,ccs=UTF-8");
    if (fp == NULL) {
        return "";
    }
    if (err != 0) {
        return "";
    }
    std::string sum;
    wchar_t str[1024] = { 0 };
    while (fgetws(str, 1024, fp) != NULL) {
        std::string 当前行内容;
        Wchar_tToString(当前行内容, str);
        sum += 当前行内容;
    }
    fclose(fp);
    return sum;
}

// Reads a text file opened in "r,ccs=UTF-8" mode and returns its contents as
// a narrow string.
//
// The CRT decodes the file to wide characters; each chunk read by fgetws is
// then converted to a narrow string with Wchar_tToString and appended, so the
// result is in the encoding that Wchar_tToString produces, not raw UTF-8.
//
// Parameters:
//   file - wide path of the file to read; a null pointer yields "".
// Returns:
//   The accumulated text, or an empty string when the file cannot be opened.
//
// Defined `inline` because this definition lives in a header.
inline std::string MYTOOL::读取文件UTF8W(const wchar_t* file) {
    // _wfopen_s treats a null file name as an invalid parameter, so reject it here.
    if (file == nullptr) {
        return "";
    }
    // Initialised so the check below never reads an indeterminate pointer if
    // _wfopen_s fails without writing to it.
    FILE* fp = nullptr;
    const auto err = _wfopen_s(&fp, file, L"r,ccs=UTF-8");
    if (err != 0 || fp == nullptr) {
        return "";
    }
    std::string sum;
    wchar_t chunk[1024] = { 0 };
    // fgetws stops at a newline or after 1023 characters, so long lines simply
    // arrive as several chunks.
    while (fgetws(chunk, 1024, fp) != nullptr) {
        std::string piece;
        Wchar_tToString(piece, chunk);
        sum += piece;
    }
    fclose(fp);
    return sum;
}

// Converts a narrow string in the system ANSI code page (CP_ACP) to UTF-16.
//
// Parameters:
//   str - narrow input; may contain embedded NUL bytes, which are converted
//         like any other character instead of cutting the result short.
// Returns:
//   The wide string, or an empty string when the input is empty or the
//   conversion fails.
//
// The conversion writes straight into the result's storage, so there is no
// temporary new[] buffer to leak if an allocation throws.
// Defined `inline` because this definition lives in a header.
inline std::wstring MYTOOL::string2wstring(std::string str) {
    if (str.empty()) {
        return std::wstring();
    }
    const int srcLen = static_cast<int>(str.size());
    // First call: ask how many wide characters the conversion needs.
    const int needed = MultiByteToWideChar(CP_ACP, 0, str.data(), srcLen, nullptr, 0);
    if (needed <= 0) {
        return std::wstring();
    }
    std::wstring result(static_cast<size_t>(needed), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, str.data(), srcLen, &result[0], needed);
    if (written <= 0) {
        return std::wstring();
    }
    result.resize(static_cast<size_t>(written));
    return result;
}

// Converts a UTF-16 string to a narrow string in the system ANSI code page
// (CP_ACP).
//
// Parameters:
//   wstr - wide input; may contain embedded NUL characters, which are
//          converted like any other character instead of cutting the result
//          short.
// Returns:
//   The narrow string, or an empty string when the input is empty or the
//   conversion fails.
//
// The conversion writes straight into the result's storage, so there is no
// temporary new[] buffer to leak if an allocation throws.
// Defined `inline` because this definition lives in a header.
inline std::string MYTOOL::wstring2string(std::wstring wstr) {
    if (wstr.empty()) {
        return std::string();
    }
    const int srcLen = static_cast<int>(wstr.size());
    // First call: ask how many bytes the conversion needs.
    const int needed = WideCharToMultiByte(CP_ACP, 0, wstr.data(), srcLen, nullptr, 0, nullptr, nullptr);
    if (needed <= 0) {
        return std::string();
    }
    std::string result(static_cast<size_t>(needed), '\0');
    const int written = WideCharToMultiByte(CP_ACP, 0, wstr.data(), srcLen, &result[0], needed, nullptr, nullptr);
    if (written <= 0) {
        return std::string();
    }
    result.resize(static_cast<size_t>(written));
    return result;
}

// Splits `str` at every occurrence of `pattern`.
//
// Parameters:
//   str     - text to split.
//   pattern - delimiter; may be longer than one character.
// Returns:
//   The pieces in order. Empty pieces are kept, so "a,,b" yields {"a","","b"},
//   a trailing delimiter yields a trailing "", and an empty `str` yields {""}.
//   An empty `pattern` cannot delimit anything, so the whole of `str` is
//   returned as the single element.
//
// Matches are searched left to right without overlapping. A "not found"
// result is compared against npos directly rather than being squeezed into
// an int, which previously let the loop restart from the beginning forever
// (for example "aX" split by "XX", or any empty pattern).
// Defined `inline` because this definition lives in a header.
inline std::vector<std::string> MYTOOL::split(std::string str, std::string pattern) {
    std::vector<std::string> result;
    if (pattern.empty()) {
        result.push_back(str);
        return result;
    }
    std::string::size_type begin = 0;
    for (;;) {
        const std::string::size_type pos = str.find(pattern, begin);
        if (pos == std::string::npos) {
            // Whatever follows the last delimiter (possibly nothing) is the final piece.
            result.push_back(str.substr(begin));
            break;
        }
        result.push_back(str.substr(begin, pos - begin));
        begin = pos + pattern.size();
    }
    return result;
}

// Converts a NUL-terminated wide string to a narrow string in the system ANSI
// code page (CP_ACP).
//
// Parameters:
//   szDst - receives the converted text; left empty when `wchar` is null or
//           the conversion fails.
//   wchar - NUL-terminated wide string to convert.
//
// The required size and the conversion itself now use the same code page.
// Previously the size was measured with CP_OEMCP and the buffer filled with
// CP_ACP, so the buffer could be too small whenever the two code pages encode
// a character with different byte counts.
// Defined `inline` because this definition lives in a header.
inline void MYTOOL::Wchar_tToString(std::string& szDst, wchar_t* wchar) {
    szDst.clear();
    if (wchar == nullptr) {
        return;
    }
    // With a source length of -1 the returned size includes the terminating NUL.
    const int needed = WideCharToMultiByte(CP_ACP, 0, wchar, -1, nullptr, 0, nullptr, nullptr);
    if (needed <= 1) {
        return;  // 0 means failure, 1 means only the terminator
    }
    std::string narrow(static_cast<size_t>(needed), '\0');
    const int written = WideCharToMultiByte(CP_ACP, 0, wchar, -1, &narrow[0], needed, nullptr, nullptr);
    if (written <= 0) {
        return;
    }
    // Drop the terminator that the API counted and wrote.
    narrow.resize(static_cast<size_t>(written) - 1);
    szDst.swap(narrow);
}

wchar_t* MYTOOL::StringToWchar_t(std::string& str) {
    wchar_t* text1 = new wchar_t[str.size() + 1];
    swprintf(text1, str.size() + 1, L"%S ", str.c_str());
    return text1;
}

// Returns the system message text for a Win32 error code.
//
// Parameters:
//   errCode - error code to describe; 0 means "use GetLastError() now".
// Returns:
//   The system message converted to a narrow string with wstring2string, or
//   "{未定义错误描述(<code>)}" when the system has no message for the code.
//
// The wide FormatMessageW is called explicitly. The TCHAR-based macro only
// produced a wchar_t buffer in UNICODE builds, so passing its result to
// wstring2string did not compile in ANSI builds.
// Defined `inline` because this definition lives in a header.
inline std::string MYTOOL::get_last_error(DWORD errCode) {
    if (errCode == 0) errCode = GetLastError();
    LPWSTR message = nullptr;
    // With FORMAT_MESSAGE_ALLOCATE_BUFFER the API allocates the buffer and
    // stores its address through the lpBuffer argument, hence the cast.
    const DWORD length = FormatMessageW(
        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
        nullptr, errCode, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
        reinterpret_cast<LPWSTR>(&message), 0, nullptr);
    if (length == 0 || message == nullptr) {
        // No message available: fall back to a placeholder carrying the code.
        char tmp[100] = { 0 };
        // DWORD is an unsigned long, so it is printed with %lu rather than %d.
        sprintf_s(tmp, "{未定义错误描述(%lu)}", static_cast<unsigned long>(errCode));
        return tmp;
    }
    // Copy out of the system buffer first so it can be released before the conversion.
    const std::wstring wide(message, length);
    LocalFree(message);
    return wstring2string(wide);
}

// Checks whether a byte buffer is well-formed UTF-8.
//
// Parameters:
//   pBuffer - bytes to inspect; a null pointer is treated as an empty buffer.
//   size    - number of bytes in pBuffer; values <= 0 are treated as empty.
// Returns:
//   1 when every complete sequence in the buffer is valid UTF-8 (pure ASCII
//   and empty buffers included), 0 otherwise.
//
// Validation follows RFC 3629: 1- to 4-byte sequences are accepted, while
// overlong encodings (C0/C1 leads, E0 80..9F, F0 80..8F), UTF-16 surrogates
// (ED A0..BF) and code points above U+10FFFF (F4 90.., F5..FF) are rejected.
// As before, a multi-byte sequence cut off by the end of the buffer is
// tolerated, so a prefix of a longer UTF-8 stream still passes.
// Defined `inline` because this definition lives in a header.
inline int MYTOOL::IsUTF8(const void* pBuffer, long size) {
    if (pBuffer == nullptr || size <= 0) {
        return 1;
    }
    const unsigned char* bytes = static_cast<const unsigned char*>(pBuffer);
    const size_t total = static_cast<size_t>(size);
    size_t i = 0;
    while (i < total) {
        const unsigned char lead = bytes[i];
        if (lead < 0x80) {  // ASCII
            ++i;
            continue;
        }
        size_t extra = 0;          // continuation bytes that must follow the lead
        unsigned char lo = 0x80;   // allowed range of the first continuation byte;
        unsigned char hi = 0xBF;   // narrowed below for the special lead bytes
        if (lead >= 0xC2 && lead <= 0xDF) {
            extra = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            extra = 2;
            if (lead == 0xE0) lo = 0xA0;       // below this would be overlong
            else if (lead == 0xED) hi = 0x9F;  // above this would be a surrogate
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            extra = 3;
            if (lead == 0xF0) lo = 0x90;       // below this would be overlong
            else if (lead == 0xF4) hi = 0x8F;  // above this would exceed U+10FFFF
        } else {
            return 0;  // stray continuation byte or a lead byte that is never valid
        }
        if (total - i <= extra) {
            break;  // sequence truncated by the end of the buffer: tolerated
        }
        if (bytes[i + 1] < lo || bytes[i + 1] > hi) {
            return 0;
        }
        for (size_t k = 2; k <= extra; ++k) {
            if ((bytes[i + k] & 0xC0) != 0x80) {
                return 0;
            }
        }
        i += extra + 1;
    }
    return 1;
}

// Reads a whole file and guesses its text encoding from its first bytes.
//
// Parameters:
//   filePath - narrow path of the file to open; a null pointer yields 0.
//   str      - receives every byte read from the file (BOM and embedded NUL
//              bytes included); left untouched when 0 is returned.
// Returns:
//   0  the file could not be opened, sized or read
//   1  UTF-8 without BOM (pure ASCII and empty files land here too)
//   2  UTF-16LE (BOM FF FE)
//   3  UTF-16BE (BOM FE FF)
//   4  UTF-8 with BOM (EF BB BF)
//   5  none of the above, possibly ANSI
//
// The size comes from the low 32 bits reported by GetFileSize, so only files
// below 4 GiB are handled.
// Defined `inline` because this definition lives in a header.
inline int MYTOOL::CalculateFileEncoding(LPCSTR filePath, std::string& str) {
    if (filePath == nullptr) {
        return 0;
    }
    HANDLE hFile = CreateFileA(filePath, FILE_GENERIC_READ, FILE_SHARE_READ, nullptr,
                               OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
    if (hFile == INVALID_HANDLE_VALUE) {
        return 0;  // nothing to close: the handle was never opened
    }
    const DWORD fileSize = GetFileSize(hFile, nullptr);
    if (fileSize == INVALID_FILE_SIZE) {
        CloseHandle(hFile);
        return 0;
    }

    // Read into a local string so `str` is only replaced after a successful
    // read and no manually managed buffer can leak.
    std::string content(static_cast<size_t>(fileSize), '\0');
    DWORD total = 0;
    while (total < fileSize) {
        DWORD got = 0;
        if (!ReadFile(hFile, &content[total], fileSize - total, &got, nullptr)) {
            // Report before CloseHandle can overwrite the thread's last-error value.
            printf("ReadFile failed : %s", get_last_error().c_str());
            CloseHandle(hFile);
            return 0;
        }
        if (got == 0) {
            break;  // end of file reached earlier than the reported size
        }
        total += got;
    }
    CloseHandle(hFile);
    content.resize(static_cast<size_t>(total));
    str.swap(content);

    // Compare as unsigned bytes: a plain char is signed on this platform, so
    // it never equals 0xFF/0xFE/0xEF. Missing bytes default to 0, which
    // matches no BOM.
    const size_t n = str.size();
    const unsigned char b0 = n > 0 ? static_cast<unsigned char>(str[0]) : 0;
    const unsigned char b1 = n > 1 ? static_cast<unsigned char>(str[1]) : 0;
    const unsigned char b2 = n > 2 ? static_cast<unsigned char>(str[2]) : 0;
    if (b0 == 0xFF && b1 == 0xFE) {
        return 2;  // UTF-16LE
    }
    if (b0 == 0xFE && b1 == 0xFF) {
        return 3;  // UTF-16BE
    }
    if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
        return 4;  // UTF-8 with BOM
    }
    if (IsUTF8(str.data(), static_cast<long>(n))) {
        return 1;  // UTF-8 without BOM
    }
    return 5;  // none of the above, possibly ANSI
}

// Reads a whole file and guesses its text encoding from its first bytes.
// Wide-path counterpart of CalculateFileEncoding.
//
// Parameters:
//   filePath - wide path of the file to open; a null pointer yields 0.
//   str      - receives every byte read from the file (BOM and embedded NUL
//              bytes included); left untouched when 0 is returned.
// Returns:
//   0  the file could not be opened, sized or read
//   1  UTF-8 without BOM (pure ASCII and empty files land here too)
//   2  UTF-16LE (BOM FF FE)
//   3  UTF-16BE (BOM FE FF)
//   4  UTF-8 with BOM (EF BB BF)
//   5  none of the above, possibly ANSI
//
// The size comes from the low 32 bits reported by GetFileSize, so only files
// below 4 GiB are handled.
// Defined `inline` because this definition lives in a header.
inline int MYTOOL::CalculateFileEncodingW(LPCWSTR filePath, std::string& str) {
    if (filePath == nullptr) {
        return 0;
    }
    HANDLE hFile = CreateFileW(filePath, FILE_GENERIC_READ, FILE_SHARE_READ, nullptr,
                               OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
    if (hFile == INVALID_HANDLE_VALUE) {
        return 0;  // nothing to close: the handle was never opened
    }
    const DWORD fileSize = GetFileSize(hFile, nullptr);
    if (fileSize == INVALID_FILE_SIZE) {
        CloseHandle(hFile);
        return 0;
    }

    // Read into a local string so `str` is only replaced after a successful
    // read and no manually managed buffer can leak.
    std::string content(static_cast<size_t>(fileSize), '\0');
    DWORD total = 0;
    while (total < fileSize) {
        DWORD got = 0;
        if (!ReadFile(hFile, &content[total], fileSize - total, &got, nullptr)) {
            // Report before CloseHandle can overwrite the thread's last-error value.
            printf("ReadFile failed : %s", get_last_error().c_str());
            CloseHandle(hFile);
            return 0;
        }
        if (got == 0) {
            break;  // end of file reached earlier than the reported size
        }
        total += got;
    }
    CloseHandle(hFile);
    content.resize(static_cast<size_t>(total));
    str.swap(content);

    // Compare as unsigned bytes: a plain char is signed on this platform (the
    // original author observed 0xFF being read back as -1), so it never equals
    // 0xFF/0xFE/0xEF. Missing bytes default to 0, which matches no BOM.
    const size_t n = str.size();
    const unsigned char b0 = n > 0 ? static_cast<unsigned char>(str[0]) : 0;
    const unsigned char b1 = n > 1 ? static_cast<unsigned char>(str[1]) : 0;
    const unsigned char b2 = n > 2 ? static_cast<unsigned char>(str[2]) : 0;
    if (b0 == 0xFF && b1 == 0xFE) {
        return 2;  // UTF-16LE
    }
    if (b0 == 0xFE && b1 == 0xFF) {
        return 3;  // UTF-16BE
    }
    if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
        return 4;  // UTF-8 with BOM
    }
    if (IsUTF8(str.data(), static_cast<long>(n))) {
        return 1;  // UTF-8 without BOM
    }
    return 5;  // none of the above, possibly ANSI
}

// Detects a file's encoding and returns its name as text.
//
// Parameters:
//   filePath   - narrow path of the file to inspect.
//   读到的文本 - passed through to CalculateFileEncoding, which stores the
//                file's content in it.
// Returns:
//   The name for the code returned by CalculateFileEncoding:
//     0 -> "READ_FAIL"      the file could not be read
//     1 -> "UTF-8"
//     2 -> "UTF-16LE_BOM"
//     3 -> "UTF16_BE_BOM"
//     4 -> "UTF8_BOM"
//     5 -> "ANSI"           none of the above, assumed to be ANSI
//   and "ERROR" for any other code.
//
// Defined `inline` because this definition lives in a header.
inline std::string MYTOOL::判断文件编码(LPCSTR filePath, std::string& 读到的文本) {
    const int code = CalculateFileEncoding(filePath, 读到的文本);
    switch (code) {
    case 0:  return "READ_FAIL";
    case 1:  return "UTF-8";
    case 2:  return "UTF-16LE_BOM";
    case 3:  return "UTF16_BE_BOM";
    case 4:  return "UTF8_BOM";
    case 5:  return "ANSI";
    default: return "ERROR";
    }
}

// Detects a file's encoding and returns its name as text. Wide-path
// counterpart of 判断文件编码.
//
// Parameters:
//   filePath   - wide path of the file to inspect.
//   读到的文本 - passed through to CalculateFileEncodingW, which stores the
//                file's content in it.
// Returns:
//   The name for the code returned by CalculateFileEncodingW:
//     0 -> "READ_FAIL"      the file could not be read
//     1 -> "UTF-8"
//     2 -> "UTF-16LE_BOM"
//     3 -> "UTF16_BE_BOM"
//     4 -> "UTF8_BOM"
//     5 -> "ANSI"           none of the above, assumed to be ANSI
//   and "ERROR" for any other code.
//
// Defined `inline` because this definition lives in a header.
inline std::string MYTOOL::判断文件编码W(LPCWSTR filePath, std::string& 读到的文本) {
    const int code = CalculateFileEncodingW(filePath, 读到的文本);
    switch (code) {
    case 0:  return "READ_FAIL";
    case 1:  return "UTF-8";
    case 2:  return "UTF-16LE_BOM";
    case 3:  return "UTF16_BE_BOM";
    case 4:  return "UTF8_BOM";
    case 5:  return "ANSI";
    default: return "ERROR";
    }
}

#endif

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

銀鈅 发表于 2023-6-3 12:34
谢谢楼主分享
Burpcka 发表于 2022-12-4 05:12
8970665 发表于 2022-12-4 09:39
sbwfnhn 发表于 2022-12-6 09:46
先收藏,以后转化成其它语言,可能用的上。
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则 警告:本版块禁止灌水或回复与主题无关内容,违者重罚!

快速回复 收藏帖子 返回列表 搜索

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-4-30 00:42

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表