想要长期运营一款 Web 或移动端产品,对内容进行必要的把关必不可少。这里分享一个基于 DFA 算法的高性能敏感词、脏词检测过滤类(C#)。

【概述】做好一个 Web 系统的安全运维,除了常规的防注入、防入侵等措施,还有一项重要工作:检测并过滤敏感词、脏词。这件事做得不好,轻则导致一场投诉或纠纷,重则导致产品被勒令关闭停运。

 废话少说,先看下代码,可以拿过去直接使用。

using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace OpenCore.ContentSecurity
{
    /// <summary>
    /// Detects and masks sensitive (banned) words in text.
    /// The dictionary lives in a static, process-wide table indexed by the first
    /// character of each word; every position of the input is checked against the
    /// words that share its first character.
    /// Reference consulted during development: https://blog.csdn.net/u011966339/article/details/72832197
    /// Change log:
    ///     2020-4-15: the dictionary is loaded once through the static <see cref="Init"/>
    ///                method instead of on every use; several dictionary files can be
    ///                loaded; algorithm details were hardened.
    /// </summary>
    /// <remarks>
    /// An instance keeps the result of its last scan in <see cref="IllegalWords"/>,
    /// so one instance must not be shared between concurrent scans. Separate
    /// instances only read a snapshot of the shared dictionary.
    /// </remarks>
    public class SensitiveWordFilter
    {
        /// <summary>
        /// Dictionary file paths passed to the most recent <see cref="Init"/> call.
        /// </summary>
        private static string[] dictionaryPathList = null;

        /// <summary>
        /// In-memory lexicon. Slot <c>[c]</c> holds the words whose first character is
        /// <c>c</c>, stored without that first character. Sized char.MaxValue + 1 so
        /// that every possible char value, including U+FFFF, is a valid index.
        /// The array is never mutated after publication; reloads swap the reference.
        /// </summary>
        private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[char.MaxValue + 1];

        /// <summary>
        /// Serializes dictionary reloads.
        /// </summary>
        private static readonly object lockObj = new object();

        /// <summary>
        /// Sets the dictionary files and (re)loads the in-memory lexicon from them.
        /// If the list is null/empty or any file is missing, the problem is logged
        /// and the currently loaded lexicon is left unchanged.
        /// </summary>
        /// <param name="sDictionaryFileName">Paths of the word-list files, one word per line.</param>
        public static void Init(string[] sDictionaryFileName)
        {
            lock (lockObj)
            {
                dictionaryPathList = sDictionaryFileName;
                LoadDictionary();
            }
        }

        public SensitiveWordFilter()
        {

        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Words found by the most recent <see cref="getDataByFilter"/> call, exactly as
        /// they appeared in the input (including any skipped noise characters).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Returns true for characters in the range U+4E00..U+9FA5 (Chinese ideographs).
        /// </summary>
        private static bool isCHS(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>
        /// Returns true for the ASCII digits 0-9.
        /// </summary>
        private static bool isNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>
        /// Returns true for the ASCII letters a-z and A-Z.
        /// </summary>
        private static bool isAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Returns true when the character counts as word content (ideograph, digit or
        /// letter). Anything else is treated as noise that may be inserted between the
        /// characters of a sensitive word.
        /// </summary>
        private static bool IsWordChar(char character)
        {
            return isCHS(character) || isNum(character) || isAlphabet(character);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// The output always has the same length as the input, so indices into the
        /// normalized text are valid indices into the original text.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The half-width, lower-case form of <paramref name="input"/>.</returns>
        /// <remarks>
        /// Full-width space is 12288, half-width space is 32. All other half-width
        /// characters (33-126) differ from their full-width forms (65281-65374) by 65248.
        /// </remarks>
        private static string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                    continue;
                }
                if (c[i] > 65280 && c[i] < 65375)
                {
                    c[i] = (char)(c[i] - 65248);
                }
            }
            // Invariant casing: dictionary keys and scanned text must normalize the same
            // way regardless of the thread's current culture.
            return new string(c).ToLowerInvariant();
        }

        /// <summary>
        /// Converts text to Simplified Chinese. Returns the input unchanged when the
        /// conversion fails, and an empty string for null/empty input.
        /// </summary>
        /// <param name="sInput">Text to convert.</param>
        /// <returns>The converted text, or <paramref name="sInput"/> on failure.</returns>
        private static string ToSimplifiedChiniese(string sInput)
        {
            if (string.IsNullOrEmpty(sInput))
            {
                return string.Empty;
            }
            try
            {
                return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
            }
            catch (Exception)
            {
                // Deliberate best effort: the conversion is only used to add an extra
                // dictionary variant, so a failure just means no variant is added.
                return sInput;
            }
        }

        /// <summary>
        /// Appends a message to the daily log file under the "SecurityLog" directory
        /// of the application base (not intended for cross-AppDomain scenarios).
        /// Logging is best effort: any failure is ignored.
        /// </summary>
        /// <param name="Msg">Message text.</param>
        private static void SaveLog(string Msg)
        {
            try
            {
                string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
                // CreateDirectory is a no-op when the directory already exists.
                Directory.CreateDirectory(sPath);
                sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
            }
            catch
            {
                // A logging failure must never break content filtering.
            }
        }

        /// <summary>
        /// Builds a new lexicon from <see cref="dictionaryPathList"/> and publishes it.
        /// The new table is built on the side and swapped in only when every file has
        /// been read, so a failed reload never leaves an empty or half-filled lexicon.
        /// </summary>
        private static void LoadDictionary()
        {
            string[] paths = dictionaryPathList;
            if (paths == null || paths.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.字典路径配置为空");
                return;
            }
            foreach (string sFileName in paths)
            {
                if (File.Exists(sFileName) == false)
                {
                    SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                    return;
                }
            }

            // Ordinal set: removes duplicates across files without a culture-sensitive sort.
            HashSet<string> uniqueWords = new HashSet<string>(StringComparer.Ordinal);
            foreach (string sDictionaryFile in paths)
            {
                // Encoding.Default is the platform default encoding, as in the original
                // implementation; I/O errors propagate to the caller of Init.
                string[] lines = File.ReadAllLines(sDictionaryFile, Encoding.Default);
                foreach (string line in lines)
                {
                    if (line == null)
                    {
                        continue;
                    }
                    // Surrounding whitespace in the word list would otherwise become
                    // part of the key and stop the word from ever matching.
                    string trimmed = line.Trim();
                    if (trimmed.Length == 0)
                    {
                        continue;
                    }
                    string key = ToDBC(trimmed);
                    uniqueWords.Add(key);
                    // Also register the Simplified Chinese variant of the word.
                    string simplified = ToSimplifiedChiniese(key);
                    if (simplified.Length > 0)
                    {
                        uniqueWords.Add(simplified);
                    }
                }
            }

            WordGroup[] lexicon = new WordGroup[char.MaxValue + 1];
            foreach (string word in uniqueWords)
            {
                WordGroup group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                // The first character is implied by the slot; store only the remainder.
                group.Add(word.Substring(1));
            }
            MEMORYLEXICON = lexicon;
        }

        /// <summary>
        /// Tries to match the remainder of a dictionary word after its first character.
        /// </summary>
        /// <param name="text">Normalized text being scanned.</param>
        /// <param name="start">Index in <paramref name="text"/> of the already matched first character.</param>
        /// <param name="remainder">The dictionary word without its first character (may be empty).</param>
        /// <returns>
        /// Index of the last matched character, or -1 when the word does not match.
        /// For an empty remainder the result is <paramref name="start"/>.
        /// </returns>
        private static int MatchRemainder(string text, int start, string remainder)
        {
            int last = start;
            for (int i = 0; i < remainder.Length; i++)
            {
                char expected = remainder[i];
                int next = last + 1;
                // Skip noise characters inserted between word characters, unless the
                // noise character is exactly what the dictionary word expects here.
                while (next < text.Length && text[next] != expected && !IsWordChar(text[next]))
                {
                    next++;
                }
                // Running off the end means only a prefix was present: not a match.
                if (next >= text.Length || text[next] != expected)
                {
                    return -1;
                }
                last = next;
            }
            return last;
        }

        /// <summary>
        /// Scans the input for sensitive words and replaces every character of each hit
        /// (including noise characters inside it) with <paramref name="replaceChar"/>.
        /// The words found are available afterwards through <see cref="IllegalWords"/>.
        /// Matching ignores case and character width; when several dictionary words
        /// match at the same position the longest one wins.
        /// </summary>
        /// <param name="sSourceInput">Text to check.</param>
        /// <param name="replaceChar">Mask character, for example '*'.</param>
        /// <returns>
        /// The masked text; <paramref name="sSourceInput"/> itself when it is null or empty.
        /// </returns>
        public string getDataByFilter(string sSourceInput, char replaceChar)
        {
            // Always reset first so an early return never exposes the previous scan's hits.
            illegalWords.Clear();
            if (string.IsNullOrEmpty(sSourceInput))
            {
                return sSourceInput;
            }
            // Snapshot: a concurrent Init swaps the reference but never mutates this array.
            WordGroup[] lexicon = MEMORYLEXICON;
            if (lexicon == null || lexicon.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.getDataByFilter.内存字典为空");
                return sSourceInput;
            }

            // Normalize once; ToDBC preserves length so indices line up with the input.
            string normalized = ToDBC(sSourceInput);
            char[] masked = sSourceInput.ToCharArray();
            int index = 0;
            while (index < normalized.Length)
            {
                int matchEnd = -1;
                WordGroup group = lexicon[normalized[index]];
                if (group != null)
                {
                    int candidates = group.Count();
                    for (int z = 0; z < candidates; z++)
                    {
                        int end = MatchRemainder(normalized, index, group.GetWord(z));
                        if (end > matchEnd)
                        {
                            matchEnd = end;
                        }
                    }
                }
                if (matchEnd < 0)
                {
                    index++;
                    continue;
                }
                illegalWords.Add(sSourceInput.Substring(index, matchEnd - index + 1));
                for (int pos = index; pos <= matchEnd; pos++)
                {
                    masked[pos] = replaceChar;
                }
                // Resume right after the hit so masked text is not scanned again.
                index = matchEnd + 1;
            }
            return new string(masked);
        }
    }
    /// <summary>
    /// An ordered, duplicate-free collection of words that share the same first
    /// character. Words keep the order in which they were first added.
    /// </summary>
    public class WordGroup
    {
        /// <summary>
        /// The words in insertion order; backs <see cref="GetWord"/> and <see cref="Count"/>.
        /// </summary>
        private List<string> groupList = new List<string>();

        /// <summary>
        /// Membership index for <see cref="groupList"/>, so that adding a word does not
        /// require a linear scan of the list.
        /// </summary>
        private HashSet<string> knownWords = new HashSet<string>();

        public WordGroup()
        {

        }

        /// <summary>
        /// Adds a word unless an identical word is already present.
        /// </summary>
        /// <param name="word">The word to add.</param>
        public void Add(string word)
        {
            // HashSet.Add returns false for an element that is already in the set.
            if (knownWords.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>
        /// Gets the number of distinct words in the group.
        /// </summary>
        /// <returns>The word count.</returns>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Gets the word at the given position.
        /// </summary>
        /// <param name="index">Zero-based position, in insertion order.</param>
        /// <returns>The word stored at <paramref name="index"/>.</returns>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
}

上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:

            // Global setup: load the dictionaries once per process; no further
            // configuration is needed afterwards.
            string[] dictionaryFiles =
            {
                @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
                @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
                @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
                @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
                @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
                @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
            };
            SensitiveWordFilter.Init(dictionaryFiles);

            // A filter can be instantiated wherever it is needed.
            SensitiveWordFilter wordFilter = new SensitiveWordFilter();

            // A few sample inputs to show the effect.
            Dictionary<string, string> dictTestData = new Dictionary<string, string>
            {
                ["杀^人游戏,有人找一夜q"] = "",
                ["数学学习课堂"] = "",
                ["打击法0功有,法0功毒害大众"] = ""
            };

            Dictionary<string, string> dictResult = new Dictionary<string, string>();
            foreach (KeyValuePair<string, string> sample in dictTestData)
            {
                string sKey = sample.Key;
                // Filter first: IllegalWords reflects the most recent getDataByFilter call.
                string filtered = wordFilter.getDataByFilter(sKey, '|');
                List<string> hits = wordFilter.IllegalWords ?? new List<string>();
                dictResult[sKey] = $"替换后:{filtered},  ------------检测违禁词:{string.Join(",", hits)}";
            }

            // Serialize the results and write them to the log.
            string sResultJson = JsonConverter.SerializeObject(dictResult);
            Utils.SaveLog(sResultJson);

 最后,给一下打印的结果:

“杀^人游戏,有人找一夜q”:     替换后:“杀^人游戏,有人找|||”, ————检测违禁词:“一夜q”,
“数学学习课堂”:     替换后:“数学学习课堂”, ————检测违禁词:“”,
“打击法0功有,法0功毒害大众”:   替换后:“打击|||有,|||毒害大众”, ————检测违禁词:“法0功,法0功”

版权声明:本文为taohuadaozhu原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/taohuadaozhu/p/12707700.html