当前位置：文江博客话题详情

找出真实的文件类型

发布于 2024-07-11 19:21:06 字数 255 浏览 18 评论 0原文

我正在开发一个处理文件上传的 ASP 网页。仅允许上传某些类型的文件，例如 .XLS、.XML、.CSV、.TXT、.PDF、.PPT 等。

我必须确定文件是否确实具有与扩展名显示的类型相同的类型。换句话说，如果trojan.exe被重命名为harmless.pdf并上传，应用程序必须能够发现上传的文件不是.PDF文件。

您将使用什么技术来分析这些上传的文件？我在哪里可以获得有关这些文件格式的最佳信息？

原文

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

怀中猫帐中妖 2024-07-18 19:21:07

以下 C++ 代码可以帮助您：

//-1 : File Does not Exist or no access
//0 : not an office document
//1 : (General) MS office 2007
//2 : (General) MS office older than 2007
//3 : MS office 2003 PowerPoint presentation
//4 : MS office 2003 Excel spreadsheet
//5 : MS office applications or others 
int IsOffice2007OrOlder(wchar_t * fileName)
{
    int iRet = 0;
    byte msgFormatChk2007[8]    = {0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x06, 0x00};     //offset 0 for office 2007 documents
    byte possibleMSOldOffice[8] = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1};     //offset 0 for possible office 2003 documents

    byte msgFormatChkXLSPPT[4]  = {0xFD, 0xFF, 0xFF, 0xFF};     // offset 512: xls, ppt: FD FF FF FF 
    byte msgFormatChkOnlyPPT[4] = {0x00, 0x6E, 0x1E, 0xF0};     // offset 512: another ppt offset PPT   
    byte msgFormatChkOnlyDOC[4] = {0xEC, 0xA5, 0xC1, 0x00};     //offset 512: EC A5 C1 00 
    byte msgFormatChkOnlyXLS[8] = {0x09, 0x08, 0x10, 0x00, 0x00, 0x06, 0x05, 0x00};     //offset 512: XLS

    int iMsgChk = 0;
    HANDLE fileHandle = CreateFile(fileName, GENERIC_READ,
        FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL  );
    if(INVALID_HANDLE_VALUE == fileHandle) 
    { 
        return -1; 
    }

    byte buff[20];
    DWORD bytesRead;
    iMsgChk = 1;
    if(0 == ReadFile(fileHandle, buff, 8, &bytesRead, NULL )) 
    { 
        return -1; 
    }

    if(buff[0] == msgFormatChk2007[0]) 
    {
        while(buff[iMsgChk] == msgFormatChk2007[iMsgChk] && iMsgChk < 9)
            iMsgChk++;

        if(iMsgChk >= 8) {  
            iRet = 1; //office 2007 file format
        }
    } 
    else if(buff[0] == possibleMSOldOffice[0])
    {
        while(buff[iMsgChk] == possibleMSOldOffice[iMsgChk] && iMsgChk < 9)
            iMsgChk++;

        if(iMsgChk >= 8)
        {   
            //old office file format, check 512 offset further in order to filter out real office format
            iMsgChk = 1;
            SetFilePointer(fileHandle, 512, NULL, FILE_BEGIN);
            if(ReadFile(fileHandle, buff, 8, &bytesRead, NULL ) == 0) { return 0; }

            if(buff[0] == msgFormatChkXLSPPT[0])
            {
                while(buff[iMsgChk] == msgFormatChkXLSPPT[iMsgChk] && iMsgChk < 5)
                    iMsgChk++;

                if(iMsgChk == 4)
                    iRet = 2;
            }
            else if(buff[iMsgChk] == msgFormatChkOnlyDOC[iMsgChk])
            {
                while(buff[iMsgChk] == msgFormatChkOnlyDOC[iMsgChk] && iMsgChk < 5)
                    iMsgChk++;
                if(iMsgChk == 4)
                    iRet = 2;

            }
            else if(buff[0] == msgFormatChkOnlyPPT[0])
            {
                while(buff[iMsgChk] == msgFormatChkOnlyPPT[iMsgChk] && iMsgChk < 5)
                    iMsgChk++;

                if(iMsgChk == 4)
                    iRet = 3;
            }
            else if(buff[0] == msgFormatChkOnlyXLS[0])
            {

                while(buff[iMsgChk] == msgFormatChkOnlyXLS[iMsgChk] && iMsgChk < 9)
                    iMsgChk++;

                if(iMsgChk == 9)
                    iRet = 4;
            } 

            if(0 == iRet){
                iRet = 5;
            }
        }
    }


    CloseHandle(fileHandle);

    return iRet;
}

The following C++ code could help you:

//-1 : File Does not Exist or no access
//0 : not an office document
//1 : (General) MS office 2007
//2 : (General) MS office older than 2007
//3 : MS office 2003 PowerPoint presentation
//4 : MS office 2003 Excel spreadsheet
//5 : MS office applications or others 
int IsOffice2007OrOlder(wchar_t * fileName)
{
    int iRet = 0;
    byte msgFormatChk2007[8]    = {0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x06, 0x00};     //offset 0 for office 2007 documents
    byte possibleMSOldOffice[8] = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1};     //offset 0 for possible office 2003 documents

    byte msgFormatChkXLSPPT[4]  = {0xFD, 0xFF, 0xFF, 0xFF};     // offset 512: xls, ppt: FD FF FF FF 
    byte msgFormatChkOnlyPPT[4] = {0x00, 0x6E, 0x1E, 0xF0};     // offset 512: another ppt offset PPT   
    byte msgFormatChkOnlyDOC[4] = {0xEC, 0xA5, 0xC1, 0x00};     //offset 512: EC A5 C1 00 
    byte msgFormatChkOnlyXLS[8] = {0x09, 0x08, 0x10, 0x00, 0x00, 0x06, 0x05, 0x00};     //offset 512: XLS

    int iMsgChk = 0;
    HANDLE fileHandle = CreateFile(fileName, GENERIC_READ,
        FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL  );
    if(INVALID_HANDLE_VALUE == fileHandle) 
    { 
        return -1; 
    }

    byte buff[20];
    DWORD bytesRead;
    iMsgChk = 1;
    if(0 == ReadFile(fileHandle, buff, 8, &bytesRead, NULL )) 
    { 
        return -1; 
    }

    if(buff[0] == msgFormatChk2007[0]) 
    {
        while(buff[iMsgChk] == msgFormatChk2007[iMsgChk] && iMsgChk < 9)
            iMsgChk++;

        if(iMsgChk >= 8) {  
            iRet = 1; //office 2007 file format
        }
    } 
    else if(buff[0] == possibleMSOldOffice[0])
    {
        while(buff[iMsgChk] == possibleMSOldOffice[iMsgChk] && iMsgChk < 9)
            iMsgChk++;

        if(iMsgChk >= 8)
        {   
            //old office file format, check 512 offset further in order to filter out real office format
            iMsgChk = 1;
            SetFilePointer(fileHandle, 512, NULL, FILE_BEGIN);
            if(ReadFile(fileHandle, buff, 8, &bytesRead, NULL ) == 0) { return 0; }

            if(buff[0] == msgFormatChkXLSPPT[0])
            {
                while(buff[iMsgChk] == msgFormatChkXLSPPT[iMsgChk] && iMsgChk < 5)
                    iMsgChk++;

                if(iMsgChk == 4)
                    iRet = 2;
            }
            else if(buff[iMsgChk] == msgFormatChkOnlyDOC[iMsgChk])
            {
                while(buff[iMsgChk] == msgFormatChkOnlyDOC[iMsgChk] && iMsgChk < 5)
                    iMsgChk++;
                if(iMsgChk == 4)
                    iRet = 2;

            }
            else if(buff[0] == msgFormatChkOnlyPPT[0])
            {
                while(buff[iMsgChk] == msgFormatChkOnlyPPT[iMsgChk] && iMsgChk < 5)
                    iMsgChk++;

                if(iMsgChk == 4)
                    iRet = 3;
            }
            else if(buff[0] == msgFormatChkOnlyXLS[0])
            {

                while(buff[iMsgChk] == msgFormatChkOnlyXLS[iMsgChk] && iMsgChk < 9)
                    iMsgChk++;

                if(iMsgChk == 9)
                    iRet = 4;
            } 

            if(0 == iRet){
                iRet = 5;
            }
        }
    }


    CloseHandle(fileHandle);

    return iRet;
}

回复收藏 0 原文

梦屿孤独相伴 2024-07-18 19:21:06

一种方法是检查文件中的某些签名或幻数。此页面有一个已知文件签名的方便列表，并且似乎是最新的：

http://www.garykessler.net/library/file_sigs.html

回复收藏 0 原文

深爱不及久伴 2024-07-18 19:21:06

换句话说，如果 trojan.exe 被重命名为 Harmless.pdf 并上传，应用程序必须能够发现上传的文件不是 .PDF 文件。

这并不是什么问题。如果 .exe 作为 .pdf 上传，并且您正确地以 application/pdf 类型将其返回给下载者，那么下载者得到的只会是一个损坏的 PDF。他们必须手动将其重命名为 .exe 才会受到伤害。

真正的问题是：

某些浏览器可能会嗅探文件的内容，并认为它们比您更了解文件的类型。 IE 在这方面尤其糟糕，如果它发现任何 HTML 标签潜伏在文件开头附近，它倾向于将文件呈现为 HTML。这是特别没有帮助的，因为这意味着脚本可以被注入到您的网站上，可能会损害任何应用程序级别的安全性（cookie 窃取等）。解决方法包括始终使用 Content-Disposition 将文件作为附件提供，和/或从不同的主机名提供文件，这样它就无法跨站点脚本返回到您的主站点。
PDF 文件无论如何都不安全！它们可能充满了脚本，并且存在严重的安全漏洞。利用 PDF 阅读器浏览器插件中的漏洞是目前在网络上安装木马的最常见方法之一。而且您通常几乎无法尝试检测漏洞，因为它们可能被高度混淆。

回复收藏 0 原文

撩人痒 2024-07-18 19:21:06

获取“安全”文件类型的文件头 - 可执行文件始终具有自己的头类型，您可能可以检测到它们。但是，您必须熟悉您打算接受的每种格式。

回复收藏 0 原文

梦巷 2024-07-18 19:21:06

我知道你说的是 C#，但这也许可以移植。此外，它还有一个 XML 文件，其中包含许多常见文件类型的描述符。

它是一个名为 JMimeMagic 的 Java 库。它在这里：http://jmimemagic.sourceforge.net/

回复收藏 0 原文

幽梦紫曦～ 2024-07-18 19:21:06

在 *NIX 系统上，我们有一个名为 file(1) 的实用程序。可以尝试为 Windows 找一个类似的工具，不过 file 实用程序本身也已经被移植到 Windows 上了。

回复收藏 0 原文

时间海 2024-07-18 19:21:06

也许你可以从不同的方向来解决这个问题。为什么不通过病毒扫描程序运行所有上传，而不是识别所有上传的文件类型（对于我来说，仅 Excel 就显得一团糟，因为它现在有多种格式）？多种文件可能包含病毒和木马。这可能会给您的服务器带来更多工作量，但这是最安全的解决方案。

然后由用户来正确识别他们的文件类型，这似乎是合理的。仅仅为了复核用户的输入而添加大量代码（这些代码也需要测试），似乎有些小题大做。如果我说它是 .pdf2 文件，您会将其重命名为 .pdf 吗？如果这是在公司环境中，那么期望用户的文件具有正确的扩展名是合理的。我还会跟踪谁上传了哪些内容。如果它是公共的，那么扫描文件类型可能是值得的，但我绝对也会进行病毒扫描。

回复收藏 0 原文

~没有更多了~