sakura-editor
diff --git a/‎sakura_core/parse/CWordParse.cpp
Lines changed: 199 additions & 0 deletions b/‎sakura_core/parse/CWordParse.cpp
Lines changed: 199 additions & 0 deletions
diff --git a/‎sakura_core/parse/CWordParse.h
Lines changed: 13 additions & 0 deletions b/‎sakura_core/parse/CWordParse.h
Lines changed: 13 additions & 0 deletions
@@ -308,3 +308,202 @@ bool CWordParse::SearchNextWordPosition4KW(
 	}
 	return false;
 }
+
+/*!
+	Narrows a wide character to a 7-bit ASCII code.
+
+	@param wc	wide character to classify
+	@return	wc unchanged (0..127) when it is an ASCII code point, otherwise 0.
+
+	@note 0 doubles as the "not ASCII" marker, so L'\0' and every non-ASCII
+		character look the same to the caller.
+	@note The range test is done on an unsigned value. Where wchar_t is a signed
+		type, a negative input would otherwise satisfy "wc <= 0x7F" and be
+		returned as a code above 127, which IsURL() would then use as an index
+		into its 128-entry character table.
+	@date 2011.12.15 Version that does not rely on wctomb(); the disabled
+		wctomb()-based variant has been dropped.
+*/
+uchar_t wc_to_c(wchar_t wc)
+{
+	const unsigned long code = static_cast<unsigned long>(wc);
+	if( code <= 0x7F ){
+		return static_cast<uchar_t>(code);
+	}
+	return 0;
+}
+
+//@@@ 2002.01.24 Start by MIK
+/*!
+	Tests whether a URL (or a mail address) starts at pszLine[offset].
+
+	The first character selects, through url_char, the first url_table entry
+	with that initial; the following entries sharing the initial are compared
+	against the text in turn. Once a scheme matches, the URL extends over every
+	following character that url_char marks as a URL character. The "mailto:"
+	scheme hands the part after the scheme to IsMailAddress(). When no scheme
+	matches, the position is tested as a bare mail address instead.
+
+	@retval TRUE  a URL or mail address starts at offset
+	@retval FALSE nothing recognised (a scheme with nothing after it also counts
+	              as FALSE), or offset lies outside [0, nLineLen)
+
+	@note Tables defined inside the function must be declared static const
+		(this affects performance).
+		A url_char value is the url_table array index + 1.
+		When adding a new URL scheme, adjust the urX constants below.
+		Keep url_table sorted alphabetically by initial letter.
+
+	@date 2007.10.23 kobake Unicode support. //$ A table (or a test routine)
+		dedicated to wchar_t would be more efficient.
+*/
+BOOL IsURL(
+	const wchar_t*	pszLine,	//!< [in]  text buffer
+	int				offset,	//!< [in]  index at which the check starts
+	int				nLineLen,	//!< [in]  length of the buffer, counted from pszLine
+	int*			pnMatchLen	//!< [out] length of the match, counted from offset. May be NULL.
+)
+{
+	struct _url_table_t {
+		wchar_t	name[12];
+		int		length;
+		bool	is_mail;
+	};
+	static const struct _url_table_t	url_table[] = {
+		/* alphabetical order */
+		{ L"file://",		7,	false }, /* 1 */
+		{ L"ftp://",		6,	false }, /* 2 */
+		{ L"gopher://",		9,	false }, /* 3 */
+		{ L"http://",		7,	false }, /* 4 */
+		{ L"https://",		8,	false }, /* 5 */
+		{ L"mailto:",		7,	true  }, /* 6 */
+		{ L"news:",			5,	false }, /* 7 */
+		{ L"nntp://",		7,	false }, /* 8 */
+		{ L"prospero://",	11,	false }, /* 9 */
+		{ L"telnet://",		9,	false }, /* 10 */
+		{ L"tp://",			5,	false }, /* 11 */	//2004.02.02
+		{ L"ttp://",		6,	false }, /* 12 */	//2004.02.02
+		{ L"wais://",		7,	false }, /* 13 */
+		{ L"{",				0,	false }  /* 14 */  /* '{' is 'z'+1 : terminate */
+	};
+
+/* Named constants that keep the table below maintainable */
+	/* signed char: the table stores -1, which plain char cannot hold where char is unsigned */
+	const signed char urF = 1;
+	const signed char urG = 3;
+	const signed char urH = 4;
+	const signed char urM = 6;
+	const signed char urN = 7;
+	const signed char urP = 9;
+	const signed char urT = 10;
+	const signed char urW = 13;	//2004.02.02
+
+	static const signed char	url_char[] = {
+	  /* +0  +1  +2  +3  +4  +5  +6  +7  +8  +9  +A  +B  +C  +D  +E  +F */
+		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,	/* +00: */
+		  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,	/* +10: */
+		  0, -1,  0, -1, -1, -1, -1,  0,  0,  0,  0, -1, -1, -1, -1, -1,	/* +20: " !"#$%&'()*+,-./" */
+		 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1,  0, -1,	/* +30: "0123456789:;<=>?" */
+		 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,	/* +40: "@ABCDEFGHIJKLMNO" */
+		 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1,  0,  0, -1,	/* +50: "PQRSTUVWXYZ[\]^_" */
+		  0, -1, -1, -1, -1, -1,urF,urG,urH, -1, -1, -1, -1,urM,urN, -1,	/* +60: "`abcdefghijklmno" */
+		urP, -1, -1, -1,urT, -1, -1,urW, -1, -1, -1,  0,  0,  0, -1,  0,	/* +70: "pqrstuvwxyz{|}~ " */
+		/* Sacrificing another 128 bytes would remove two if statements */
+		/* 0    : not url char
+		 * -1   : url char
+		 * other: url head char --> url_table array number + 1
+		 */
+	};
+
+	/* Nothing to examine: do not read outside [pszLine, pszLine + nLineLen) */
+	if( offset < 0 || nLineLen <= offset ){
+		return FALSE;
+	}
+
+	const wchar_t * const begin = pszLine + offset;
+	const wchar_t * const end   = pszLine + nLineLen;
+	const int nRest = static_cast<int>(end - begin);	/* characters available from begin */
+	const uchar_t head = wc_to_c(*begin);
+
+	if( head == 0 ) return FALSE;	/* not an ASCII character */
+	if( 0 < url_char[head] ){	/* character that can start a URL scheme */
+		/* walk the table entries that share this initial */
+		for( const struct _url_table_t *urlp = &url_table[url_char[head]-1]; urlp->name[0] == head; urlp++ ){
+			if( nRest < urlp->length || auto_memcmp(urlp->name, begin, urlp->length) != 0 ){
+				continue;	/* scheme does not match */
+			}
+			if( urlp->is_mail ){	/* mail-specific analysis */
+				/* IsMailAddress() subtracts the offset from the length itself,
+				   so it must be given the whole length counted from begin. */
+				int nMailLen = 0;
+				if( !IsMailAddress(begin, urlp->length, nRest, &nMailLen) ){
+					return FALSE;
+				}
+				if( NULL != pnMatchLen ){
+					*pnMatchLen = nMailLen + urlp->length;
+				}
+				return TRUE;
+			}
+			/* ordinary analysis: extend over URL characters */
+			int i = urlp->length;
+			while( i < nRest ){
+				const uchar_t c = wc_to_c(begin[i]);
+				if( c == 0 || url_char[c] == 0 ) break;	/* reached the end of the URL */
+				i++;
+			}
+			if( i == urlp->length ) return FALSE;	/* scheme only */
+			if( NULL != pnMatchLen ){
+				*pnMatchLen = i;
+			}
+			return TRUE;
+		}
+	}
+	return IsMailAddress(pszLine, offset, nLineLen, pnMatchLen);
+}
+
+/*!
+	Tests whether a mail address starts at pszBuf[offset].
+
+	Accepted form:
+	- local part: one or more characters in 0x21..0x7E other than
+	  " ( ) , : ; < > @ [ \ ], not starting with '.'
+	- '@'
+	- domain: two or more labels made of [A-Za-z0-9_-], separated by '.'
+
+	The character just before offset must not be a valid local-part character;
+	otherwise the position is in the middle of a longer token (boundary check
+	using the preceding character).
+
+	@param pszBuf			[in]  text buffer
+	@param offset			[in]  index at which the check starts
+	@param nBufLen			[in]  length of the buffer, counted from pszBuf (not from offset)
+	@param pnAddressLenfth	[out] length of the address, counted from offset.
+							May be NULL. Written only when TRUE is returned.
+	@retval TRUE  a mail address starts at offset
+	@retval FALSE no mail address, or offset lies outside [0, nBufLen)
+
+	@date 2016.04.27 Symbol characters are permitted.
+*/
+BOOL IsMailAddress( const wchar_t* pszBuf, int offset, int nBufLen, int* pnAddressLenfth )
+{
+	struct {
+		bool operator()(const wchar_t ch)
+		{
+			return 0x21 <= ch && ch <= 0x7E && NULL == wcschr(L"\"(),:;<>@[\\]", ch);
+		}
+	} IsValidChar;
+
+	struct {
+		bool operator()(const wchar_t ch)
+		{
+			return (ch >= L'a' && ch <= L'z')
+				|| (ch >= L'A' && ch <= L'Z')
+				|| (ch >= L'0' && ch <= L'9')
+				|| (ch == L'-')
+				|| (ch == L'_');
+		}
+	} IsDomainChar;
+
+	/* Nothing to examine: do not read outside [pszBuf, pszBuf + nBufLen) */
+	if( offset < 0 || nBufLen <= offset ){
+		return FALSE;
+	}
+
+	/* Boundary check using the preceding character */
+	if( 0 < offset && IsValidChar(pszBuf[offset-1]) ){
+		return FALSE;
+	}
+
+	/* From here on, index 0 is the start position */
+	pszBuf  += offset;
+	nBufLen -= offset;
+
+	/* Local part */
+	if( pszBuf[0] == L'.' || !IsValidChar(pszBuf[0]) ){
+		return FALSE;
+	}
+	int j = 1;
+	while( j < nBufLen - 2 && IsValidChar(pszBuf[j]) ){
+		j++;
+	}
+	/* '@' has to leave at least two more characters in the buffer */
+	if( j >= nBufLen - 2 ){
+		return FALSE;
+	}
+	if( L'@' != pszBuf[j] ){
+		return FALSE;
+	}
+	j++;
+
+	/* Domain part: label ( '.' label )+ */
+	int nDotCount = 0;
+	for (;;) {
+		const int nBgn = j;
+		while( j < nBufLen && IsDomainChar(pszBuf[j]) ){
+			j++;
+		}
+		if( j == nBgn ){
+			return FALSE;	/* empty label */
+		}
+		/* The end of the buffer ends the domain just like any non-'.' character;
+		   pszBuf[nBufLen] itself is never read. */
+		if( j >= nBufLen || L'.' != pszBuf[j] ){
+			if( 0 == nDotCount ){
+				return FALSE;	/* a domain needs at least one '.' */
+			}
+			break;
+		}
+		nDotCount++;
+		j++;
+	}
+	if( NULL != pnAddressLenfth ){
+		*pnAddressLenfth = j;
+	}
+	return TRUE;
+}
@@ -125,6 +125,19 @@ class CWordParse{
 	static bool _match_charlist( const WCHAR c, const WCHAR *pszList );
 };
 
+/*!
+	Tests whether a URL starts at psz[offset] and reports its length.
+	Fast version: the added offset argument makes boundary detection possible.
+*/
+BOOL IsURL( const wchar_t* psz, int offset, int length, int* outLength );
+
+/*!
+	Returns TRUE and the length when the given address is the head of a URL.
+	Obsolete since the fast version taking an offset was added; forwards to it
+	with an offset of 0.
+*/
+inline BOOL IsURL( const wchar_t* psz, int length, int* outLength )
+{
+	const int nHeadOffset = 0;	/* check starts at the head of psz */
+	return IsURL(psz, nHeadOffset, length, outLength);
+}
+/*!
+	Tests whether a mail address starts at psz[offset] and reports its length.
+	Fast version: the added offset argument makes boundary detection possible.
+*/
+BOOL IsMailAddress( const wchar_t* psz, int offset, int length, int* outLength );
+
+/*!
+	Returns a non-zero value and the length when the current position is a mail address.
+	Obsolete since the fast version taking an offset was added; forwards to it
+	with an offset of 0.
+*/
+inline BOOL IsMailAddress( const wchar_t* psz, int length, int* outLength )
+{
+	const int nHeadOffset = 0;	/* check starts at the head of psz */
+	return IsMailAddress(psz, nHeadOffset, length, outLength);
+}
+
 // ACHAR 版
 inline bool CWordParse::_match_charlist( const ACHAR c, const ACHAR *pszList )
 {