Web 系统安全运营之基础——基于 DFA 算法的高性能敏感词、脏词检测过滤类(C#)
【概述】做好一个 Web 系统的安全运维,除了常规的防注入、防入侵等工作之外,还有一项重要工作:检测并过滤敏感词、脏词。这件事做得不好,轻则导致一场投诉或纠纷,重则导致产品被勒令关闭停运。
废话少说,先看下代码,可以拿过去直接使用。
using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace OpenCore.ContentSecurity
{
    /// <summary>
    /// Detects and masks illegal / sensitive words in a text using a lexicon that is
    /// indexed by the first character of every word.
    /// Reference consulted before development:
    /// https://blog.csdn.net/u011966339/article/details/72832197
    /// Change log:
    /// 2020-4-15: the dictionary is loaded once through <see cref="Init"/> instead of
    /// on every use; several dictionary files can be loaded; robustness fixes.
    /// </summary>
    /// <remarks>
    /// The lexicon is shared by all instances. Each instance keeps its own
    /// <see cref="IllegalWords"/> list, so one instance must not be used by several
    /// threads at the same time.
    /// </remarks>
    public class SensitiveWordFilter
    {
        /// <summary>
        /// One slot per UTF-16 code unit. It must be char.MaxValue + 1, otherwise the
        /// character U+FFFF indexes one past the end of the array.
        /// </summary>
        private const int LexiconSize = char.MaxValue + 1;

        /// <summary>Dictionary files handed to the last call of <see cref="Init"/>.</summary>
        private static string[] dictionaryPathList = null;

        /// <summary>
        /// In-memory lexicon: slot <c>c</c> holds the tails (word minus first character)
        /// of all words starting with <c>c</c>. The array is never modified after it has
        /// been published; a reload builds a new array and swaps the reference.
        /// </summary>
        private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[LexiconSize];

        /// <summary>Serializes concurrent calls to <see cref="Init"/>.</summary>
        private static readonly object lockObj = new object();

        /// <summary>
        /// Configures the dictionary files and loads them into memory. Needs to be
        /// called once per process before filtering.
        /// </summary>
        /// <param name="sDictionaryFileName">Paths of the dictionary files, one word per line.</param>
        public static void Init(string[] sDictionaryFileName)
        {
            lock (lockObj)
            {
                dictionaryPathList = sDictionaryFileName;
                LoadDictionary();
            }
        }

        public SensitiveWordFilter()
        {
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Illegal words found by the most recent call to <see cref="getDataByFilter"/>,
        /// exactly as they were spelled in the input (including skipped punctuation).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Tells whether a character can be skipped while matching: anything that is
        /// not a CJK ideograph (U+4E00..U+9FA5), an ASCII digit or an ASCII letter.
        /// </summary>
        private static bool IsIgnorable(char character)
        {
            bool isChinese = character >= 0x4e00 && character <= 0x9fa5;
            bool isDigit = character >= '0' && character <= '9';
            bool isLetter = (character >= 'a' && character <= 'z')
                         || (character >= 'A' && character <= 'Z');
            return !(isChinese || isDigit || isLetter);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>A normalized string with the same length as <paramref name="input"/>.</returns>
        /// <remarks>
        /// The ideographic space is 12288 and maps to 32. Every other full-width
        /// character (65281-65374) is its half-width counterpart (33-126) plus 65248.
        /// </remarks>
        private static string ToDBC(string input)
        {
            char[] chars = input.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                if (chars[i] == 12288)
                {
                    chars[i] = (char)32;
                }
                else if (chars[i] > 65280 && chars[i] < 65375)
                {
                    chars[i] = (char)(chars[i] - 65248);
                }
            }

            // Invariant casing keeps dictionary and input comparable no matter which
            // culture the calling thread runs under (e.g. the Turkish dotless i).
            return new string(chars).ToLowerInvariant();
        }

        /// <summary>
        /// Converts traditional Chinese to simplified Chinese.
        /// </summary>
        /// <param name="sInput">Text to convert.</param>
        /// <returns>
        /// The converted text, an empty string for null/empty input, or the input
        /// unchanged when the conversion fails.
        /// </returns>
        private static string ToSimplifiedChiniese(string sInput)
        {
            if (string.IsNullOrEmpty(sInput))
            {
                return string.Empty;
            }

            try
            {
                return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
            }
            catch (Exception)
            {
                // Deliberately best effort: the simplified variant is only an extra
                // dictionary entry, so a failed conversion falls back to the original
                // word. Not logged, because this runs once per dictionary word.
                return sInput;
            }
        }

        /// <summary>
        /// Appends a message to the daily log file in the "SecurityLog" folder below
        /// the application base directory. Never throws.
        /// </summary>
        /// <param name="Msg">Message to write.</param>
        private static void SaveLog(string Msg)
        {
            try
            {
                string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
                Directory.CreateDirectory(sPath); // does nothing when the folder already exists
                sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
            }
            catch (Exception)
            {
                // Logging is diagnostics only and must not break filtering.
            }
        }

        /// <summary>
        /// Reads all configured dictionary files and publishes a new lexicon.
        /// When the configuration is empty or a file is missing, the problem is logged
        /// and the current lexicon is left untouched.
        /// </summary>
        private static void LoadDictionary()
        {
            if (dictionaryPathList == null || dictionaryPathList.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.字典路径配置为空");
                return;
            }

            foreach (string sFileName in dictionaryPathList)
            {
                if (!File.Exists(sFileName))
                {
                    SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                    return;
                }
            }

            HashSet<string> uniqueWords = new HashSet<string>(StringComparer.Ordinal);
            foreach (string sDictionaryFile in dictionaryPathList)
            {
                foreach (string line in File.ReadAllLines(sDictionaryFile, Encoding.Default))
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    // Trim first: an entry stored with trailing blanks could never match.
                    string key = ToDBC(line.Trim());
                    uniqueWords.Add(key);

                    // Also register the simplified-Chinese spelling of traditional entries.
                    string simplified = ToSimplifiedChiniese(key);
                    if (simplified.Length > 0)
                    {
                        uniqueWords.Add(simplified);
                    }
                }
            }

            // Ordinal order puts a word before every longer word it is a prefix of,
            // which fixes the order in which candidates are tried during matching.
            List<string> orderedWords = new List<string>(uniqueWords);
            orderedWords.Sort(StringComparer.Ordinal);

            WordGroup[] lexicon = new WordGroup[LexiconSize];
            foreach (string word in orderedWords)
            {
                WordGroup group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                group.Add(word.Substring(1));
            }

            // Publish in one step so that running filters never see a half-built lexicon.
            MEMORYLEXICON = lexicon;
        }

        /// <summary>
        /// Tries to match the tail of a dictionary word against the normalized text.
        /// </summary>
        /// <param name="text">Normalized text being scanned.</param>
        /// <param name="start">Index right after the already matched first character.</param>
        /// <param name="tail">Dictionary word without its first character; may be empty.</param>
        /// <param name="end">On success, the index one past the last matched character.</param>
        /// <returns>True only when every character of <paramref name="tail"/> was found.</returns>
        private static bool TryMatchTail(string text, int start, string tail, out int end)
        {
            int pos = start;
            for (int i = 0; i < tail.Length; i++)
            {
                char expected = tail[i];

                // Step over separators inserted to dodge the filter ("a^b^c"), unless the
                // separator is itself the character the dictionary word expects here.
                while (pos < text.Length && text[pos] != expected && IsIgnorable(text[pos]))
                {
                    pos++;
                }

                // Running out of text in the middle of a word is a miss, not a hit.
                if (pos >= text.Length || text[pos] != expected)
                {
                    end = start;
                    return false;
                }
                pos++;
            }

            end = pos;
            return true;
        }

        /// <summary>
        /// Replaces every sensitive word in the input by the given character and
        /// records the words that were found in <see cref="IllegalWords"/>.
        /// </summary>
        /// <param name="sSourceInput">Text to check.</param>
        /// <param name="replaceChar">Mask character, for example '*'.</param>
        /// <returns>
        /// The masked text, same length as the input. Null or empty input is returned
        /// unchanged.
        /// </returns>
        public string getDataByFilter(string sSourceInput, char replaceChar)
        {
            // Reset first so that an early return never exposes the previous call's hits.
            illegalWords.Clear();

            if (string.IsNullOrEmpty(sSourceInput))
            {
                return sSourceInput;
            }

            WordGroup[] lexicon = MEMORYLEXICON; // one read: stays consistent during a reload
            if (lexicon == null || lexicon.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.getDataByFilter.内存字典为空");
                return sSourceInput;
            }

            // Normalize once. ToDBC maps character by character, so every index of the
            // normalized text addresses the same position in the original input.
            string normalized = ToDBC(sSourceInput);
            char[] masked = sSourceInput.ToCharArray();

            int cursor = 0;
            while (cursor < normalized.Length)
            {
                int matchEnd = -1;
                WordGroup group = lexicon[normalized[cursor]];
                if (group != null)
                {
                    int candidates = group.Count();
                    for (int z = 0; z < candidates; z++)
                    {
                        int end;
                        if (TryMatchTail(normalized, cursor + 1, group.GetWord(z), out end))
                        {
                            matchEnd = end;
                            break; // first hit wins; the cursor moves on below
                        }
                    }
                }

                if (matchEnd < 0)
                {
                    cursor++;
                    continue;
                }

                illegalWords.Add(sSourceInput.Substring(cursor, matchEnd - cursor));
                for (int pos = cursor; pos < matchEnd; pos++)
                {
                    masked[pos] = replaceChar;
                }
                cursor = matchEnd;
            }

            return new string(masked);
        }
    }

    /// <summary>
    /// Set of word tails that share the same first character, kept in insertion order.
    /// </summary>
    public class WordGroup
    {
        /// <summary>Tails in insertion order, for access by index.</summary>
        private readonly List<string> groupList = new List<string>();

        /// <summary>Same tails again, for constant-time duplicate detection.</summary>
        private readonly HashSet<string> known = new HashSet<string>(StringComparer.Ordinal);

        public WordGroup()
        {
        }

        /// <summary>
        /// Adds a word unless it is already present.
        /// </summary>
        /// <param name="word">Word to add.</param>
        public void Add(string word)
        {
            if (known.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>
        /// Number of words in the group.
        /// </summary>
        /// <returns>The word count.</returns>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Returns the word at the given position.
        /// </summary>
        /// <param name="index">Zero-based position.</param>
        /// <returns>The word stored at <paramref name="index"/>.</returns>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
}
上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:
// Global configuration: load the lexicon files once for the whole process.
string[] lexiconFiles =
{
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
};
SensitiveWordFilter.Init(lexiconFiles);

// A filter can be created wherever it is needed; separate instances may run concurrently.
var filter = new SensitiveWordFilter();

// A few samples to see the effect; only the keys are used as input.
var samples = new Dictionary<string, string>
{
    ["杀^人游戏,有人找一夜q"] = "",
    ["数学学习课堂"] = "",
    ["打击法0功有,法0功毒害大众"] = ""
};

var report = new Dictionary<string, string>();
foreach (string sample in samples.Keys)
{
    // Filter first: IllegalWords describes the most recent call.
    string maskedText = filter.getDataByFilter(sample, '|');
    List<string> hits = filter.IllegalWords == null ? new List<string>() : filter.IllegalWords;
    report[sample] = $"替换后:{maskedText}, ------------检测违禁词:{string.Join(",", hits)}";
}

string reportJson = JsonConverter.SerializeObject(report);
Utils.SaveLog(reportJson);
最后,给一下打印的结果:
"杀^人游戏,有人找一夜q": "替换后:杀^人游戏,有人找|||, ------------检测违禁词:一夜q",
"数学学习课堂": "替换后:数学学习课堂, ------------检测违禁词:",
"打击法0功有,法0功毒害大众": "替换后:打击|||有,|||毒害大众, ------------检测违禁词:法0功,法0功"