Skip to content

Commit 8047dbd

Browse files
committed
为发布0.6.0准备
1 parent 9440021 commit 8047dbd

File tree

3 files changed

+263
-0
lines changed

3 files changed

+263
-0
lines changed
Iveely.SearchEngine/EntityExtrator.cs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
7+
namespace Iveely.SearchEngine
8+
{
9+
/// <summary>
/// Information-extraction helper. At present it only forwards text to the
/// HMM segmenter and exposes part of the segmenter's result.
/// </summary>
public class EntityExtrator
{
    /// <summary>
    /// Patterns intended for entity extraction. Nothing in this class fills or reads the list yet.
    /// </summary>
    private readonly List<string> _entityPatterns = new List<string>();

    /// <summary>
    /// Patterns intended for entity-relation extraction. Nothing in this class fills or reads the list yet.
    /// </summary>
    private readonly List<string> _relationPatterns = new List<string>();

    /// <summary>
    /// Segmenter instance obtained from <c>HMMSegment.GetInstance()</c>
    /// (described by the original author as the part-of-speech analysis component).
    /// </summary>
    private readonly Iveely.Framework.Text.HMMSegment _segmenter = Iveely.Framework.Text.HMMSegment.GetInstance();

    /// <summary>
    /// Creates the extractor. No patterns are registered at construction time.
    /// </summary>
    public EntityExtrator()
    {
    }

    /// <summary>
    /// Runs the segmenter over <paramref name="text"/> and returns the second
    /// array of the tuple produced by <c>SplitToArray</c>.
    /// </summary>
    /// <param name="text">Text handed to the segmenter unchanged.</param>
    /// <returns>
    /// <c>Item2</c> of the segmenter result.
    /// NOTE(review): presumably the per-token part-of-speech tags - confirm against HMMSegment.
    /// </returns>
    public string[] GetInfo(string text)
    {
        return _segmenter.SplitToArray(text).Item2;
    }
}
40+
}

Iveely.SearchEngine/Iveely.SearchEngine.csproj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,15 @@
4646
<Reference Include="System.Xml" />
4747
</ItemGroup>
4848
<ItemGroup>
49+
<Compile Include="BaikeDataCrawler.cs" />
4950
<Compile Include="Crawler.cs" />
5051
<Compile Include="Host.cs" />
5152
<Compile Include="Backstage.cs">
5253
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
5354
</Compile>
5455
<Compile Include="Index.cs" />
56+
<Compile Include="EntityExtrator.cs" />
57+
<Compile Include="KnowlegeIndex.cs" />
5558
<Compile Include="Library.cs" />
5659
<Compile Include="Properties\AssemblyInfo.cs" />
5760
<Compile Include="QuestionGetter.cs">
Iveely.SearchEngine/KnowlegeIndex.cs

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
using System;
2+
using System.Collections;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
using Iveely.Data;
8+
using Iveely.Database;
9+
using Iveely.Framework.DataStructure;
10+
11+
namespace Iveely.SearchEngine
12+
{
13+
/// <summary>
/// Builds the question/answer knowledge store and its keyword index from the
/// crawled pages: reads every page from <c>Baike\Baike_data.db4</c>, extracts
/// question entities, writes them to <c>Baike\Baike_question.db4</c> and
/// writes keyword postings to <c>Baike\Baike_question_index.db4</c>.
/// </summary>
public class KnowlegeIndex
{
    /// <summary>
    /// One extracted piece of knowledge (a question with its answer and source).
    /// </summary>
    public class KnowledgeEntity
    {
        /// <summary>
        /// Identifier; also the key of the row in the question store.
        /// </summary>
        public long Id { get; set; }

        /// <summary>
        /// Entity A.
        /// </summary>
        public string EntityA { get; set; }

        /// <summary>
        /// Entity B.
        /// </summary>
        public string EntityB { get; set; }

        /// <summary>
        /// Relation between the two entities.
        /// </summary>
        public string Relation { get; set; }

        /// <summary>
        /// Question text; this is the text that gets segmented and indexed.
        /// </summary>
        public string QuestionDesc { get; set; }

        /// <summary>
        /// Answer to the question.
        /// </summary>
        public string Answer { get; set; }

        /// <summary>
        /// URL of the page the knowledge was extracted from.
        /// </summary>
        public string RefUrl { get; set; }

        /// <summary>
        /// Validity period. Start() stores the short date string of the extraction time here.
        /// </summary>
        public string EffectTime { get; set; }
    }

    /// <summary>
    /// Keyword index record. Kept for source compatibility; the indexing code
    /// in this class builds its postings directly and no longer uses it.
    /// </summary>
    public class KeywordIndex
    {
        public string Keyword;
        public long Id;
        public double Weight;
    }

    /// <summary>
    /// Equality over the content of a <see cref="KnowledgeEntity"/>, ignoring
    /// <c>Id</c> (unique per entity) and <c>EffectTime</c> (extraction date).
    /// Without it Distinct() falls back to reference equality and never
    /// removes anything.
    /// </summary>
    private sealed class EntityContentComparer : IEqualityComparer<KnowledgeEntity>
    {
        public bool Equals(KnowledgeEntity x, KnowledgeEntity y)
        {
            if (ReferenceEquals(x, y))
            {
                return true;
            }
            if (x == null || y == null)
            {
                return false;
            }
            return string.Equals(x.EntityA, y.EntityA, StringComparison.Ordinal)
                && string.Equals(x.EntityB, y.EntityB, StringComparison.Ordinal)
                && string.Equals(x.Relation, y.Relation, StringComparison.Ordinal)
                && string.Equals(x.QuestionDesc, y.QuestionDesc, StringComparison.Ordinal)
                && string.Equals(x.Answer, y.Answer, StringComparison.Ordinal)
                && string.Equals(x.RefUrl, y.RefUrl, StringComparison.Ordinal);
        }

        public int GetHashCode(KnowledgeEntity obj)
        {
            if (obj == null)
            {
                return 0;
            }
            unchecked
            {
                int hash = 17;
                hash = hash * 31 + HashOf(obj.EntityA);
                hash = hash * 31 + HashOf(obj.EntityB);
                hash = hash * 31 + HashOf(obj.Relation);
                hash = hash * 31 + HashOf(obj.QuestionDesc);
                hash = hash * 31 + HashOf(obj.Answer);
                hash = hash * 31 + HashOf(obj.RefUrl);
                return hash;
            }
        }

        private static int HashOf(string value)
        {
            return value == null ? 0 : value.GetHashCode();
        }
    }

    /// <summary>
    /// Number of pending entities that must be exceeded before a batch is written.
    /// </summary>
    private const int MaxPendingEntities = 100;

    /// <summary>
    /// Word segmenter used to split question text into keywords.
    /// </summary>
    private readonly Iveely.Framework.Text.HMMSegment segment;

    /// <summary>
    /// Extracts question entities from page content.
    /// </summary>
    private readonly QuestionGetter questionGetter;

    /// <summary>
    /// Entities extracted but not yet written to the question store.
    /// </summary>
    private readonly List<KnowledgeEntity> entities;

    /// <summary>
    /// Next identifier to hand out.
    /// NOTE(review): starts at 0 for every instance, so a second run against
    /// existing store files overwrites question rows with the same ids while
    /// the index keeps appending postings - confirm this is intended.
    /// </summary>
    private long currentId = 0;

    /// <summary>
    /// Creates the indexer with a fresh question extractor, an empty pending
    /// list and the segmenter instance from <c>HMMSegment.GetInstance()</c>.
    /// </summary>
    public KnowlegeIndex()
    {
        questionGetter = new QuestionGetter();
        entities = new List<KnowledgeEntity>();
        segment = Iveely.Framework.Text.HMMSegment.GetInstance();
    }

    /// <summary>
    /// Walks every crawled page, extracts its question entities and stores
    /// them (with their keyword index) in batches. Prints a countdown of the
    /// remaining pages and waits for a line on standard input before returning.
    /// </summary>
    public void Start()
    {
        string dataPath = "Baike\\Baike_data.db4";
        using (IStorageEngine engine = STSdb.FromFile(dataPath))
        {
            // 1. Read the crawled pages.
            ITable<string, BaikeDataCrawler.Page> table = engine.OpenXTable<string, BaikeDataCrawler.Page>("WebPage");
            long remaining = table.Count();
            foreach (var keyValuePair in table)
            {
                Console.WriteLine(remaining--);
                BaikeDataCrawler.Page page = (BaikeDataCrawler.Page)keyValuePair.Value;

                // 2. Extract the questions of this page.
                List<QuestionGetter.QuestionEntity> questionEntities = questionGetter.GetKnowledge(page.Content);
                if (questionEntities != null)
                {
                    foreach (var questionEntity in questionEntities)
                    {
                        KnowledgeEntity entity = new KnowledgeEntity();
                        entity.EntityA = questionEntity.EntityA;
                        entity.EntityB = questionEntity.EntityB;
                        entity.QuestionDesc = questionEntity.QuestionDesc;
                        entity.Answer = questionEntity.Answer;
                        entity.Relation = questionEntity.Relation;
                        entity.EffectTime = System.DateTime.Now.ToShortDateString();
                        entity.RefUrl = page.Url;
                        entity.Id = currentId++;
                        entities.Add(entity);
                    }
                }

                // 3. Persist once enough entities are pending.
                if (entities.Count > MaxPendingEntities)
                {
                    FlushPendingEntities();
                }
            }
        }

        // Write whatever is left after the last full batch.
        FlushPendingEntities();
        Console.ReadLine();
    }

    /// <summary>
    /// De-duplicates the pending entities by content, stores them and empties
    /// the pending list. Does nothing when no entity is pending.
    /// De-duplication only covers the current batch; ids of dropped duplicates
    /// are simply left unused.
    /// </summary>
    private void FlushPendingEntities()
    {
        if (entities.Count == 0)
        {
            return;
        }

        // Materialise once so the query is not re-evaluated by the consumer.
        List<KnowledgeEntity> unique = entities.Distinct(new EntityContentComparer()).ToList();
        InsertEntity(unique);
        entities.Clear();
    }

    /// <summary>
    /// Writes the given entities to the question store and their keyword
    /// postings to the index store. Both stores are opened once per call and
    /// committed after all entities have been processed.
    /// </summary>
    /// <param name="ces">Entities to store, keyed by their <c>Id</c>.</param>
    private void InsertEntity(IEnumerable<KnowledgeEntity> ces)
    {
        string questionPath = "Baike\\Baike_question.db4";
        string indexPath = "Baike\\Baike_question_index.db4";
        using (IStorageEngine questionEngine = STSdb.FromFile(questionPath))
        using (IStorageEngine indexEngine = STSdb.FromFile(indexPath))
        {
            ITable<long, KnowledgeEntity> questionTable = questionEngine.OpenXTable<long, KnowledgeEntity>("WebPage");
            ITable<string, List<Slots<long, double>>> indexTable = indexEngine.OpenXTable<string, List<Slots<long, double>>>("WebPage");
            foreach (var knowledgeEntity in ces)
            {
                questionTable[knowledgeEntity.Id] = knowledgeEntity;
                InsertIndex(indexTable, knowledgeEntity.Id, knowledgeEntity.QuestionDesc);
            }
            indexEngine.Commit();
            questionEngine.Commit();
        }
    }

    /// <summary>
    /// Segments <paramref name="text"/> and adds one posting (entity id plus
    /// weight) per distinct keyword to <paramref name="indexTable"/>. The
    /// weight is the keyword's count divided by the total number of tokens.
    /// The caller is responsible for committing the store.
    /// </summary>
    /// <param name="indexTable">Keyword to posting-list table of the index store.</param>
    /// <param name="id">Identifier of the entity the text belongs to.</param>
    /// <param name="text">Question text to index; null or empty text is skipped.</param>
    private void InsertIndex(ITable<string, List<Slots<long, double>>> indexTable, long id, string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return;
        }

        string[] results = segment.Split(text);
        if (results == null || results.Length < 1)
        {
            return;
        }

        var frequency = new IntTable<string, int>();
        frequency.Add(results);
        foreach (DictionaryEntry de in frequency)
        {
            string keyword = de.Key.ToString();
            double weight = int.Parse(de.Value.ToString()) * 1.0 / results.Length;

            List<Slots<long, double>> postings = indexTable.Find(keyword);
            if (postings == null)
            {
                postings = new List<Slots<long, double>>();
            }
            postings.Add(new Slots<long, double>(id, weight));

            // Always write the list back: mutating the instance returned by
            // Find is not a documented way to update the stored record, so an
            // append without this assignment may never reach the store.
            indexTable[keyword] = postings;
        }
    }
}
220+
}

0 commit comments

Comments
 (0)