-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.html
More file actions
3384 lines (1991 loc) · 290 KB
/
Copy pathindex.html
File metadata and controls
3384 lines (1991 loc) · 290 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html class="theme-next gemini use-motion" lang="zh-Hans">
<head>
<meta charset="UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>
<meta name="theme-color" content="#222">
<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />
<link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css" />
<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />
<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css" />
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=5.1.4">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=5.1.4">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=5.1.4">
<link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">
<meta name="keywords" content="Hexo, NexT" />
<meta name="description" content="MoriatyC的学习心得">
<meta property="og:type" content="website">
<meta property="og:title" content="Be Better">
<meta property="og:url" content="http://yoursite.com/index.html">
<meta property="og:site_name" content="Be Better">
<meta property="og:description" content="MoriatyC的学习心得">
<meta property="og:locale" content="zh-Hans">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Be Better">
<meta name="twitter:description" content="MoriatyC的学习心得">
<script type="text/javascript" id="hexo.configurations">
var NexT = window.NexT || {};
var CONFIG = {
root: '/',
scheme: 'Gemini',
version: '5.1.4',
sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
fancybox: true,
tabs: true,
motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
duoshuo: {
userId: '0',
author: '博主'
},
algolia: {
applicationID: '',
apiKey: '',
indexName: '',
hits: {"per_page":10},
labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
}
};
</script>
<link rel="canonical" href="http://yoursite.com/"/>
<title>Be Better</title>
<script type="text/javascript">
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "https://hm.baidu.com/hm.js?437ed2136949939c261ae148b39218da";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</head>
<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">
<div class="container sidebar-position-left
page-home">
<div class="headband"></div>
<header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
<div class="header-inner"><div class="site-brand-wrapper">
<div class="site-meta ">
<div class="custom-logo-site-title">
<a href="/" class="brand" rel="start">
<span class="logo-line-before"><i></i></span>
<span class="site-title">Be Better</span>
<span class="logo-line-after"><i></i></span>
</a>
</div>
<p class="site-subtitle">厚积薄发</p>
</div>
<div class="site-nav-toggle">
<button>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
</button>
</div>
</div>
<nav class="site-nav">
<ul id="menu" class="menu">
<li class="menu-item menu-item-home">
<a href="/" rel="section">
<i class="menu-item-icon fa fa-fw fa-home"></i> <br />
首页
</a>
</li>
<li class="menu-item menu-item-tags">
<a href="/tags/" rel="section">
<i class="menu-item-icon fa fa-fw fa-tags"></i> <br />
标签
</a>
</li>
<li class="menu-item menu-item-categories">
<a href="/categories/" rel="section">
<i class="menu-item-icon fa fa-fw fa-th"></i> <br />
分类
</a>
</li>
<li class="menu-item menu-item-archives">
<a href="/archives/" rel="section">
<i class="menu-item-icon fa fa-fw fa-archive"></i> <br />
归档
</a>
</li>
<li class="menu-item menu-item-sitemap">
<a href="/sitemap.xml" rel="section">
<i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />
Sitemap
</a>
</li>
<li class="menu-item menu-item-search">
<a href="javascript:;" class="popup-trigger">
<i class="menu-item-icon fa fa-search fa-fw"></i> <br />
搜索
</a>
</li>
</ul>
<div class="site-search">
<div class="popup search-popup local-search-popup">
<div class="local-search-header clearfix">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<span class="popup-btn-close">
<i class="fa fa-times-circle"></i>
</span>
<div class="local-search-input-wrapper">
<input autocomplete="off"
placeholder="搜索..." spellcheck="false"
type="text" id="local-search-input">
</div>
</div>
<div id="local-search-result"></div>
</div>
</div>
</nav>
</div>
</header>
<main id="main" class="main">
<div class="main-inner">
<div class="content-wrap">
<div id="content" class="content">
<section id="posts" class="posts-expand">
<article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://yoursite.com/2018/05/11/负载均衡/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="name" content="MoriatyC">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/avatar.gif">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Be Better">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
<a class="post-title-link" href="/2018/05/11/负载均衡/" itemprop="url">负载均衡</a></h1>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2018-05-11T10:48:06+08:00">
2018-05-11
</time>
</span>
<span class="post-category" >
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/架构/" itemprop="url" rel="index">
<span itemprop="name">架构</span>
</a>
</span>
</span>
<span id="/2018/05/11/负载均衡/" class="leancloud_visitors" data-flag-title="负载均衡">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-eye"></i>
</span>
<span class="post-meta-item-text">阅读次数:</span>
<span class="leancloud-visitors-count"></span>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<h1 id="一-方法"><a href="#一-方法" class="headerlink" title="一. 方法"></a>一. 方法</h1><h2 id="1-HTTP重定向负载均衡"><a href="#1-HTTP重定向负载均衡" class="headerlink" title="1. HTTP重定向负载均衡"></a>1. HTTP重定向负载均衡</h2><p>将http请求通过302的临时重定向给实际服务器</p>
<ul>
<li>优点: 简单</li>
<li>缺点: 需要请求量词服务器,性能差; 重定向服务器称为瓶颈;可能被判断为seo作弊,降低搜索排名</li>
</ul>
<h2 id="2-DNS域名解析负载均衡"><a href="#2-DNS域名解析负载均衡" class="headerlink" title="2. DNS域名解析负载均衡"></a>2. DNS域名解析负载均衡</h2><p>在DNS服务器中进行负载均衡算法,分配相应的应用服务器</p>
<ul>
<li>优点: 将负载均衡工作转交给DNS,DNS本身支持就近分配</li>
<li><p>缺点: DNS多级解析,下线某台服务器不能及时更新;控制权在域名服务商,无法做跟多的改善和管理</p>
</li>
<li><p>实际: 使用DNS作为一级负载均衡,得到内部的负载均衡服务器,再通过内部负载均衡服务器进行web服务器的分配。</p>
</li>
</ul>
<h2 id="3-反向代理负载均衡"><a href="#3-反向代理负载均衡" class="headerlink" title="3. 反向代理负载均衡"></a>3. 反向代理负载均衡</h2><ul>
<li>优点: 属于七层负载均衡,和反向代理服务器功能集成在一起部署简单</li>
<li>缺点: 反向代理服务器是所有请求和相应的中转站,可能成为瓶颈</li>
</ul>
<h2 id="4-IP负载均衡"><a href="#4-IP负载均衡" class="headerlink" title="4. IP负载均衡"></a>4. IP负载均衡</h2><ul>
<li>修改数据包而不是转发,在负载均衡服务器将目的地址改成web服务器,响应数据包再回到负载均衡服务器,在负载均衡服务器将数据包源地址修改为自身</li>
<li>优点: 在内核进程完成数据分发性能更好</li>
<li>缺点: 集群的最大响应数据吞吐量不得不受制于负载均衡服务器网卡宽带</li>
</ul>
<h2 id="5-数据链路层负载均衡"><a href="#5-数据链路层负载均衡" class="headerlink" title="5. 数据链路层负载均衡"></a>5. 数据链路层负载均衡</h2><p><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/load.png" alt=""></p>
<ul>
<li>三角传输模式,不修改IP地址,只修改mac地址,通过配置真是物理服务器所有机器虚拟IP和负载均衡服务器IP地址一样,使得不修改数据包源地址和目的地址就可以进行数据分发。</li>
<li>优点: 不需要负载均衡服务器进行地址转换,可直接将响应数据包直接返回给用户浏览器,这种方式又叫直接路由方式DR</li>
</ul>
<h1 id="二-常见负载均衡算法"><a href="#二-常见负载均衡算法" class="headerlink" title="二. 常见负载均衡算法"></a>二. 常见负载均衡算法</h1><ol>
<li><p>轮询</p>
</li>
<li><p>加权轮询</p>
<figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">IpMap</span> </span>{</span><br><span class="line"> <span class="comment">/**</span></span><br><span class="line"><span class="comment"> * key: Ip</span></span><br><span class="line"><span class="comment"> * Value: 权重</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"> <span class="keyword">public</span> <span class="keyword">static</span> LinkedHashMap<String, Integer> serverWeightMap = <span class="keyword">new</span> LinkedHashMap<>();</span><br><span class="line"> <span class="keyword">static</span> {</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.100"</span>, <span class="number">1</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.101"</span>, <span class="number">1</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.102"</span>, <span class="number">4</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.103"</span>, <span class="number">1</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.104"</span>, <span class="number">1</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.105"</span>, <span class="number">3</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.106"</span>, <span class="number">1</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.107"</span>, <span class="number">2</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.108"</span>, <span class="number">1</span>);</span><br><span class="line"> serverWeightMap.put(<span class="string">"192.168.1.109"</span>, <span class="number">1</span>);</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">getServerByAddWeightRoundRobin</span><span class="params">()</span> </span>{</span><br><span class="line"> Map<String, Integer> serverMap = <span class="keyword">new</span> LinkedHashMap<>();</span><br><span class="line"> <span class="comment">//IpMap为web服务器集合</span></span><br><span class="line"> serverMap.putAll(IpMap.serverWeightMap);</span><br><span class="line"> Set<String> keySet = serverMap.keySet();</span><br><span class="line"> Iterator<String> iterator = keySet.iterator();</span><br><span class="line"></span><br><span class="line"> List<String> serverList = <span class="keyword">new</span> ArrayList<String>();</span><br><span class="line"> <span class="keyword">while</span> (iterator.hasNext()) {</span><br><span class="line"> String server = iterator.next();</span><br><span class="line"> <span class="keyword">int</span> weight = serverMap.get(server);</span><br><span class="line"> <span class="keyword">for</span> (<span class="keyword">int</span> i = <span class="number">0</span>; i < weight; i++)</span><br><span class="line"> serverList.add(server);</span><br><span class="line"> }</span><br><span class="line"> String server = <span class="keyword">null</span>;</span><br><span class="line"> <span class="keyword">synchronized</span> (pos) {</span><br><span class="line"> <span class="keyword">if</span> (pos == keySet.size()) {</span><br><span class="line"> pos = <span class="number">0</span>;</span><br><span class="line"> }</span><br><span class="line"> server = serverList.get(pos);</span><br><span class="line"> pos++;</span><br><span class="line"> }</span><br><span class="line"> <span class="keyword">return</span> server;</span><br><span class="line"> }</span><br></pre></td></tr></table></figure>
</li>
<li><p>随机</p>
</li>
<li>最少连接</li>
</ol>
<p>记录每个应用服务器正在处理的连接数,将新到的请求分发到最少连接服务器上。</p>
<ol>
<li>源地址散列(一致性hash)</li>
</ol>
<h1 id="三-一致性hash"><a href="#三-一致性hash" class="headerlink" title="三. 一致性hash"></a>三. 一致性hash</h1><p>一致性哈希算法是分布式系统中常用的算法。比如,一个分布式的存储系统,要将数据存储到具体的节点上,<br>如果采用普通的hash方法,将数据映射到具体的节点上,如key%N,key是数据的key,N是机器节点数,<br>如果有一个机器加入或退出这个集群,则所有的数据映射都无效了。</p>
<p>一致性哈希算法解决了普通余数Hash算法伸缩性差的问题,可以保证在上线、下线服务器的情况下尽量有多的请求命中原来路由到的服务器。</p>
<h2 id="1-原理"><a href="#1-原理" class="headerlink" title="1.原理"></a>1.原理</h2><h3 id="1-环形Hash空间"><a href="#1-环形Hash空间" class="headerlink" title="(1). 环形Hash空间"></a>(1). 环形Hash空间</h3><p>按照常用的hash算法来将对应的key哈希到一个具有2^32次方个桶的空间中,即0~(2^32)-1的数字空间中。</p>
<p>现在我们可以将这些数字头尾相连,想象成一个闭合的环形。如下图<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/yizhihash.png" alt=""></p>
<h3 id="2-把数据通过一定的hash算法处理后映射到环上"><a href="#2-把数据通过一定的hash算法处理后映射到环上" class="headerlink" title="(2). 把数据通过一定的hash算法处理后映射到环上"></a>(2). 把数据通过一定的hash算法处理后映射到环上</h3><p>现在我们将object1、object2、object3、object4四个对象通过特定的Hash函数计算出对应的key值,然后散列到Hash环上。如下图:<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/hash2.png" alt=""></p>
<p>Hash(object1) = key1;<br>Hash(object2) = key2;<br>Hash(object3) = key3;<br>Hash(object4) = key4;</p>
<h3 id="3-将机器通过hash算法映射到环上"><a href="#3-将机器通过hash算法映射到环上" class="headerlink" title="(3). 将机器通过hash算法映射到环上"></a>(3). 将机器通过hash算法映射到环上</h3><p>在采用一致性哈希算法的分布式集群中将新的机器加入,其原理是通过使用与对象存储一样的Hash算法将机器也映射到环中</p>
<p>(一般情况下对机器的hash计算是采用机器的IP或者机器唯一的别名作为输入值),然后以顺时针的方向计算,将所有对象存储到离自己最近的机器中。</p>
<p>假设现在有NODE1,NODE2,NODE3三台机器,通过Hash算法得到对应的KEY值,映射到环中,其示意图如下:<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/hash3.png" alt=""></p>
<p>Hash(NODE1) = KEY1;<br>Hash(NODE2) = KEY2;<br>Hash(NODE3) = KEY3;</p>
<p>通过上图可以看出对象与机器处于同一哈希空间中,这样按顺时针转动object1存储到了NODE1中,object3存储到了NODE2中,object2、object4存储到了NODE3中。</p>
<p>在这样的部署环境中,hash环是不会变更的,因此,通过算出对象的hash值就能快速的定位到对应的机器中,这样就能找到对象真正的存储位置了。</p>
<h2 id="2-机器的删除与添加"><a href="#2-机器的删除与添加" class="headerlink" title="2.机器的删除与添加"></a>2.机器的删除与添加</h2><p>普通hash求余算法最为不妥的地方就是在有机器的添加或者删除之后会造成大量的对象存储位置失效。下面来分析一下一致性哈希算法是如何处理的。</p>
<h3 id="1-节点(机器)的删除"><a href="#1-节点(机器)的删除" class="headerlink" title="(1). 节点(机器)的删除"></a>(1). 节点(机器)的删除</h3><p>以上面的分布为例,如果NODE2出现故障被删除了,那么按照顺时针迁移的方法,object3将会被迁移到NODE3中,这样仅仅是object3的映射位置发生了变化,其它的对象没有任何的改动。如下图:<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/hash11.png" alt=""></p>
<h3 id="2-节点(机器)的添加"><a href="#2-节点(机器)的添加" class="headerlink" title="(2). 节点(机器)的添加"></a>(2). 节点(机器)的添加</h3><p>如果往集群中添加一个新的节点NODE4,通过对应的哈希算法得到KEY4,并映射到环中,如下图:<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/hash22.png" alt=""></p>
<p>通过按顺时针迁移的规则,那么object2被迁移到了NODE4中,其它对象还保持着原有的存储位置。<br>通过对节点的添加和删除的分析,一致性哈希算法在保持了单调性的同时,还是数据的迁移达到了最小,这样的算法对分布式集群来说是非常合适的,避免了大量数据迁移,减小了服务器的的压力。</p>
<h2 id="3-平衡性–虚拟节点"><a href="#3-平衡性–虚拟节点" class="headerlink" title="3.平衡性–虚拟节点"></a>3.平衡性–虚拟节点</h2><p>根据上面的图解分析,一致性哈希算法满足了单调性和负载均衡的特性以及一般hash算法的分散性,但这还并不能当做其被广泛应用的原由,</p>
<p>因为还缺少了平衡性。下面将分析一致性哈希算法是如何满足平衡性的。</p>
<p>hash算法是不保证平衡的,如上面只部署了NODE1和NODE3的情况(NODE2被删除的图),object1存储到了NODE1中,而object2、object3、object4都存储到了NODE3中,这样就造成了非常不平衡的状态。在一致性哈希算法中,为了尽可能的满足平衡性,其引入了虚拟节点。</p>
<p>——“虚拟节点”( virtual node )是实际节点(机器)在 hash 空间的复制品( replica ),一个实际节点(机器)对应了若干个“虚拟节点”,这个对应个数也成为“复制个数”,“虚拟节点”在 hash 空间中以hash值排列。</p>
<p>以上面只部署了NODE1和NODE3的情况(NODE2被删除的图)为例,之前的对象在机器上的分布很不均衡,现在我们以2个副本(复制个数)为例,这样整个hash环中就存在了4个虚拟节点,最后对象映射的关系图如下:</p>
<p><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/hash44.png" alt=""></p>
<p>根据上图可知对象的映射关系:object1->NODE1-1,object2->NODE1-2,object3->NODE3-2,object4->NODE3-1。通过虚拟节点的引入,对象的分布就比较均衡了。那么在实际操作中,正真的对象查询是如何工作的呢?对象从hash到虚拟节点到实际节点的转换如下图:<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/hash55.png" alt=""></p>
<p>“虚拟节点”的hash计算可以采用对应节点的IP地址加数字后缀的方式。例如假设NODE1的IP地址为192.168.1.100。引入“虚拟节点”前,计算 cache A 的 hash 值:<br>Hash(“192.168.1.100”);<br>引入“虚拟节点”后,计算“虚拟节”点NODE1-1和NODE1-2的hash值:<br>Hash(“192.168.1.100#1”); // NODE1-1<br>Hash(“192.168.1.100#2”); // NODE1-2</p>
<h2 id="4-实现"><a href="#4-实现" class="headerlink" title="4.实现"></a>4.实现</h2><figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br><span class="line">93</span><br><span class="line">94</span><br><span class="line">95</span><br><span class="line">96</span><br><span class="line">97</span><br><span class="line">98</span><br><span class="line">99</span><br><span class="line">100</span><br><span class="line">101</span><br><span class="line">102</span><br><span class="line">103</span><br><span class="line">104</span><br><span class="line">105</span><br><span class="line">106</span><br><span class="line">107</span><br><span class="line">108</span><br><span class="line">109</span><br><span class="line">110</span><br><span class="line">111</span><br><span class="line">112</span><br><span class="line">113</span><br><span class="line">114</span><br><span class="line">115</span><br><span class="line">116</span><br><span class="line">117</span><br><span class="line">118</span><br><span class="line">119</span><br><span class="line">120</span><br><span class="line">121</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">LoadBalancer</span><<span class="title">T</span>> </span>{</span><br><span class="line"> <span class="keyword">private</span> <span class="keyword">static</span> Integer pos = <span class="number">0</span>;</span><br><span class="line"> <span class="keyword">private</span> <span class="keyword">static</span> MessageDigest md5 = <span class="keyword">null</span>;</span><br><span class="line"> <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">int</span> numberOfReplicas;</span><br><span class="line"> <span class="keyword">private</span> SortedMap<String, T> circle = <span class="keyword">new</span> TreeMap<>();</span><br><span class="line"> <span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@param</span> numberOfReplicas, 虚拟结点个数</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@param</span> nodes, 实际结点集合</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="title">LoadBalancer</span><span class="params">(<span class="keyword">int</span> numberOfReplicas, Collection<T> nodes)</span> </span>{</span><br><span class="line"> <span class="keyword">this</span>.numberOfReplicas = numberOfReplicas;</span><br><span class="line"> <span class="keyword">for</span> (T node: nodes) {</span><br><span class="line"> add(node);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> <span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@param</span> node</span></span><br><span class="line"><span class="comment"> * </span></span><br><span class="line"><span class="comment"> * 为每个加入的实际结点添加虚拟结点</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">add</span><span class="params">(T node)</span> </span>{</span><br><span class="line"> <span class="keyword">for</span> (<span class="keyword">int</span> i = <span class="number">0</span>; i < numberOfReplicas; i++) {</span><br><span class="line"> <span class="comment">//key:虚拟结点的string,用类似node1:1, node1:2, node1:3类似用这样的分隔方式设置虚拟节点</span></span><br><span class="line"> <span class="comment">//value:实际结点</span></span><br><span class="line"> circle.put(hash(node.toString() + i), node);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> <span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@param</span> node</span></span><br><span class="line"><span class="comment"> * 删除相应实际结点对应的虚拟结点</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">remove</span><span class="params">(T node)</span> </span>{</span><br><span class="line"> <span class="keyword">for</span> (<span class="keyword">int</span> i = <span class="number">0</span>; i < numberOfReplicas; i++) {</span><br><span class="line"> circle.remove(hash(node.toString() + i));</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> </span><br><span class="line"> <span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@param</span> key</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@return</span> 返回该key结点将要存储的实际结点</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"> <span class="function"><span class="keyword">public</span> T <span class="title">getRealNode</span><span class="params">(Object key)</span> </span>{</span><br><span class="line"> <span class="keyword">if</span> (circle.isEmpty()) {</span><br><span class="line"> <span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line"> }</span><br><span class="line"> String hash = hash(key.toString());</span><br><span class="line"> <span class="keyword">if</span> (!circle.containsKey(hash)) {</span><br><span class="line"> SortedMap<String, T> tailMap = circle.tailMap(hash);</span><br><span class="line"> hash = tailMap.isEmpty() ? circle.firstKey() : tailMap.firstKey();</span><br><span class="line"> }</span><br><span class="line"> <span class="keyword">return</span> circle.get(hash);</span><br><span class="line"> }</span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">long</span> <span class="title">getSize</span><span class="params">()</span> </span>{</span><br><span class="line"> <span class="keyword">return</span> circle.size();</span><br><span class="line"> }</span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">hash</span><span class="params">(String key)</span> </span>{</span><br><span class="line"> <span class="keyword">if</span> (md5 == <span class="keyword">null</span>) {</span><br><span class="line"> <span class="keyword">try</span> {</span><br><span class="line"> md5 = MessageDigest.getInstance(<span class="string">"MD5"</span>);</span><br><span class="line"> } <span class="keyword">catch</span> (NoSuchAlgorithmException e) {</span><br><span class="line"> <span class="keyword">throw</span> <span class="keyword">new</span> IllegalStateException(<span class="string">"no md5 algrithm found"</span>);</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> md5.reset();</span><br><span class="line"> md5.update(key.getBytes());</span><br><span class="line"> <span class="keyword">byte</span>[] bKey = md5.digest();</span><br><span class="line"> </span><br><span class="line"> <span class="comment">// 将md5摘要转成long格式</span></span><br><span class="line"> <span class="keyword">long</span> result = ((<span class="keyword">long</span>) (bKey[<span class="number">3</span>] & <span class="number">0xFF</span>) << <span class="number">24</span>)</span><br><span class="line"> | ((<span class="keyword">long</span>) (bKey[<span class="number">2</span>] & <span class="number">0xFF</span>) << <span class="number">16</span> </span><br><span class="line"> | ((<span class="keyword">long</span>) (bKey[<span class="number">1</span>] & <span class="number">0xFF</span>) << <span class="number">8</span>) </span><br><span class="line"> | (<span class="keyword">long</span>) (bKey[<span class="number">0</span>] & <span class="number">0xFF</span>));</span><br><span class="line"> <span class="keyword">return</span> String.valueOf(result & <span class="number">0xffffffffL</span>);</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"></span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> String <span class="title">getServerByAddWeightRoundRobin</span><span class="params">()</span> </span>{</span><br><span class="line"> Map<String, Integer> serverMap = <span class="keyword">new</span> LinkedHashMap<>();</span><br><span class="line"> serverMap.putAll(IpMap.serverWeightMap);</span><br><span class="line"> Set<String> keySet = serverMap.keySet();</span><br><span class="line"> Iterator<String> iterator = keySet.iterator();</span><br><span class="line"></span><br><span class="line"> List<String> serverList = <span class="keyword">new</span> ArrayList<String>();</span><br><span class="line"> <span class="keyword">while</span> (iterator.hasNext()) {</span><br><span class="line"> String server = iterator.next();</span><br><span class="line"> <span class="keyword">int</span> weight = serverMap.get(server);</span><br><span class="line"> <span class="keyword">for</span> (<span class="keyword">int</span> i = <span class="number">0</span>; i < weight; i++)</span><br><span class="line"> serverList.add(server);</span><br><span class="line"> }</span><br><span class="line"> String server = <span class="keyword">null</span>;</span><br><span class="line"> <span class="keyword">synchronized</span> (pos) {</span><br><span class="line"> <span class="keyword">if</span> (pos == keySet.size()) {</span><br><span class="line"> pos = <span class="number">0</span>;</span><br><span class="line"> }</span><br><span class="line"> server = serverList.get(pos);</span><br><span class="line"> pos++;</span><br><span class="line"> }</span><br><span class="line"> <span class="keyword">return</span> server;</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line"> </span><br><span class="line"> <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">main</span><span class="params">(String[] args)</span> <span class="keyword">throws</span> NoSuchAlgorithmException </span>{</span><br><span class="line"> Set<String> nodes = <span class="keyword">new</span> HashSet<>();</span><br><span class="line"> nodes.add(<span class="string">"A"</span>);</span><br><span class="line"> nodes.add(<span class="string">"B"</span>);</span><br><span class="line"> nodes.add(<span class="string">"C"</span>);</span><br><span class="line"> nodes.add(<span class="string">"D"</span>);</span><br><span class="line"> nodes.add(<span class="string">"E"</span>);</span><br><span class="line"> LoadBalancer<String> lb = <span class="keyword">new</span> LoadBalancer<>(<span class="number">100</span>, nodes);</span><br><span class="line"> Map<String, Integer> count = <span class="keyword">new</span> HashMap<>();</span><br><span class="line"> <span class="keyword">for</span> (<span class="keyword">int</span> i = <span class="number">0</span>; i < <span class="number">10000000</span>; i++) {</span><br><span class="line"> String real = lb.getRealNode(i + <span class="string">""</span>);</span><br><span class="line"> count.put(real, count.getOrDefault(real, <span class="number">0</span>) + <span class="number">1</span>);</span><br><span class="line"> }</span><br><span class="line"> <span class="keyword">for</span> (String i: count.keySet()) {</span><br><span class="line"> System.out.println(count.get(i));</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"></span><br><span class="line">}</span><br></pre></td></tr></table></figure>
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</div>
</article>
<article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://yoursite.com/2018/04/16/页面级去重-文本相似度计算/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="name" content="MoriatyC">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/avatar.gif">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Be Better">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
<a class="post-title-link" href="/2018/04/16/页面级去重-文本相似度计算/" itemprop="url">页面级去重-文本相似度计算</a></h1>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2018-04-16T21:32:41+08:00">
2018-04-16
</time>
</span>
<span class="post-category" >
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/机器学习/" itemprop="url" rel="index">
<span itemprop="name">机器学习</span>
</a>
</span>
</span>
<span id="/2018/04/16/页面级去重-文本相似度计算/" class="leancloud_visitors" data-flag-title="页面级去重-文本相似度计算">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-eye"></i>
</span>
<span class="post-meta-item-text">阅读次数:</span>
<span class="leancloud-visitors-count"></span>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<h1 id="一-前言"><a href="#一-前言" class="headerlink" title="一. 前言"></a>一. 前言</h1><p>很久没写博客了,这段时间事情实在是多,很多看到的新知识只能草草的记下来没时间整理。今天下午面完阿里爸爸二面,晚上应该不会再突击我了,趁热把下午遇到的一个没解决的问题记录一下。</p>
<p>在爬虫去重这个问题上,单纯使用md5这样的完全比较hash算法不是很精确,因为不可能两个网页完全一样,都会有些稍微的更改,这样我们就需要记录并计算两篇文章的相似性进行查重。文章相似性计算应该属于NLP中的内容,由于我是个机器学习小白,所以我尽量使用工程化的思路来解决这个问题。</p>
<h1 id="二-最小编辑距离"><a href="#二-最小编辑距离" class="headerlink" title="二. 最小编辑距离"></a>二. 最小编辑距离</h1><p>这种方法可能是最简单的方法了,编辑距离(Edit Distance),又称Levenshtein距离,是指两个字串之间,由一个转成另一个所需的最少编辑操作次数。许可的编辑操作包括:</p>
<ol>
<li>将一个字符替换成另一个字符</li>
<li>插入一个字符</li>
<li>删除一个字符。</li>
</ol>
<p>一般来说,编辑距离越小,两个串的相似度越大。然后拿编辑距离去除以两者之间的最大长度,意味着只要变动这么多就可以从A变成B,所以不用变动的字符便代表了相似度。这种方法实现比较简单,在<编程之美>上也有过介绍,主要是通过递归来实现。</p>
<p>两个字符串A,B的编辑距离最大是他们的长度之和,每次操作之后可能有如下6种情况:</p>
<ol>
<li>删除A的第一个字符计算A[2….lenA],B[1….lenB]</li>
<li>删除B的第一个字符计算A[1….lenA],B[2….lenB]</li>
<li>修改A的第一个字符计算A[2….lenA],B[2….lenB]</li>
<li>修改B的第一个字符计算A[2….lenA],B[2….lenB]</li>
<li>插入A的第一个字符到B计算A[1….lenA],B[2….lenB]</li>
<li>插入B的第一个字符到A计算A[2….lenA],B[1….lenB]</li>
</ol>
<p>我们可以将上述的6中情况合并成3种:</p>
<ol>
<li>一步操作后计算A[2….lenA],B[1….lenB]</li>
<li>一步操作后计算A[1….lenA],B[2….lenB]</li>
<li>一步操作后计算A[2….lenA],B[2….lenB]</li>
</ol>
<p>然后每次获得三者中最小的一个再+1进行迭代即可。这种方法最简单也是最容易实现,但是可能每来一篇文章都需要和文档库中的文章进行比较,而且文章过长的话,递归深度也很大。</p>
<h1 id="三-simhash"><a href="#三-simhash" class="headerlink" title="三. simhash"></a>三. simhash</h1><h2 id="1-TF-IDF"><a href="#1-TF-IDF" class="headerlink" title="1. TF-IDF"></a>1. TF-IDF</h2><blockquote>
<p>TF-IDF是一种统计方法,用以评估一字词对于一个文件集或一个语料库中的其中一份文件的重要程度。字词的重要性随着它在文件中出现的次数成正比增加,但同时会随着它在语料库中出现的频率成反比下降。TF-IDF加权的各种形式常被搜索引擎应用,作为文件与用户查询之间相关程度的度量或评级。除了TF-IDF以外,因特网上的搜索引擎还会使用基于链接分析的评级方法,以确定文件在搜寻结果中出现的顺序。</p>
</blockquote>
<p>这是摘自百度百科的解释,简而言之就是一个词语在一篇文章中出现的次数越多说明他越重要,一个词在文档库中出现的次数越少说明他越有代表性,利用这两个值进行一个权值相乘的计算,就能代表一个词语对于一个文章的重要程度。</p>
<h2 id="2-simhash的实现"><a href="#2-simhash的实现" class="headerlink" title="2. simhash的实现"></a>2. simhash的实现</h2><p>simhash是谷歌发明的算法,据说很nb,可以将一个文档转换成64位的字节,然后我们可以通过判断两个字节的汉明距离就知道是否相似了。</p>
<p>在信息论中,两个等长字符串之间的汉明距离是两个字符串对应位置的不同字符的个数。换句话说,它就是将一个字符串变换成另外一个字符串所需要替换的字符个数。例如:</p>
<p>1011101 与 1001001 之间的汉明距离是 2。</p>
<p>首先我们来计算SimHash:</p>
<ol>
<li>提取文档关键词得到[word,weight]这个一个数组。(举例 [美国,4])</li>
<li>用hash算法将word转为固定长度的二进制值的字符串[hash(word),weight]。(举例 [100101,4])</li>
<li>word的hash从左到右与权重相乘,如果为1则乘以1 ,如果是0则曾以-1。(举例4,-4,-4,4,-4,4)</li>
<li>接着计算下个数,直到将所有分词得出的词计算完,然后将每个词第三步得出的数组中的每一个值相加。(举例美国和51区,[4,-4,-4,4,-4,4]和[5 -5 5 -5 5 5]得到[9 -9 1 -1 1 9])</li>
<li>对第四步得到的数组中每一个值进行判断,如果>0记为1,如果<0记为0。(举例[101011])</li>
</ol>
<p><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/simhash.jpg" alt=""><br>第四步得出的就是这个文档的SimHash。</p>
<p>这样我们就能将两个不同长度的文档转换为同样长度的SimHash值,so,我们现在可以计算第一个文档的值和第二个文档的汉明距离(一般<3就是相似度高的)。</p>
<p>SimHash本质上是局部敏感性的hash(如果是两个相似的句子,那么只会有部分不同),和md5之类的不一样。 正因为它的局部敏感性,所以我们可以使用海明距离来衡量SimHash值的相似度。</p>
<p>如果想要小数形式的可以这么做:1 - 汉明距离 / 最长关键词数组长度。</p>
<h1 id="引用"><a href="#引用" class="headerlink" title="引用"></a>引用</h1><blockquote>
<p><编程之美><br><a href="https://blog.csdn.net/chinafire525/article/details/78686876" target="_blank" rel="noopener">https://blog.csdn.net/chinafire525/article/details/78686876</a></p>
</blockquote>
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</div>
</article>
<article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://yoursite.com/2018/03/10/同步异步和阻塞非阻塞的区别/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="name" content="MoriatyC">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/avatar.gif">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Be Better">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
<a class="post-title-link" href="/2018/03/10/同步异步和阻塞非阻塞的区别/" itemprop="url">同步异步和阻塞非阻塞的区别</a></h1>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2018-03-10T17:03:26+08:00">
2018-03-10
</time>
</span>
<span class="post-category" >
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/计算机基础/" itemprop="url" rel="index">
<span itemprop="name">计算机基础</span>
</a>
</span>
</span>
<span id="/2018/03/10/同步异步和阻塞非阻塞的区别/" class="leancloud_visitors" data-flag-title="同步异步和阻塞非阻塞的区别">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-eye"></i>
</span>
<span class="post-meta-item-text">阅读次数:</span>
<span class="leancloud-visitors-count"></span>
</span>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<h1 id="吐槽"><a href="#吐槽" class="headerlink" title="吐槽"></a>吐槽</h1><p>早上五点半胃疼醒了,去医院噼里啪啦检查了一天花了1500啥也没检查出来,继续疼。在排队的时候刷到知乎上的这个问题感觉似懂非懂,这里总结一下看到的几个比较好的解释。</p>
<h1 id="一-I-O模型"><a href="#一-I-O模型" class="headerlink" title="一. I/O模型"></a>一. I/O模型</h1><ul>
<li>阻塞式I/O</li>
<li>非阻塞式I/O</li>
<li>I/O复用(select,poll,epoll…)</li>
<li>信号驱动式I/O(SIGIO)</li>
<li>异步I/O(POSIX的aio_系列函数)</li>
</ul>
<p>一个输入操作一般有两个不同的阶段:</p>
<ol>
<li>等待数据准备好</li>
<li>从内核到进程拷贝数据</li>
</ol>
<p>对于一个套接字上的输入操作,第一步通常涉及等待数据从网络中到达。当所有等待分组到达时,它被复制到内核中的某个缓冲区。第二步就是把数据从内核缓冲区复制到应用程序缓冲区。 </p>
<h2 id="1-阻塞式I-O模型"><a href="#1-阻塞式I-O模型" class="headerlink" title="1. 阻塞式I/O模型"></a>1. 阻塞式I/O模型</h2><p>默认情况下,所有套接字都是阻塞的。下面我们以阻塞套接字的recvfrom的的调用图来说明阻塞。 标红的这部分过程就是阻塞,直到阻塞结束recvfrom才能返回。</p>
<p><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io1.jpg" alt=""></p>
<h2 id="2-非阻塞式I-O"><a href="#2-非阻塞式I-O" class="headerlink" title="2.非阻塞式I/O"></a>2.非阻塞式I/O</h2><p>以下这句话很重要:进程把一个套接字设置成非阻塞是在通知内核,当所请求的I/O操作非得把本进程投入睡眠才能完成时,不要把进程投入睡眠,而是返回一个错误。看看非阻塞的套接字的recvfrom操作如何进行,可以看出recvfrom总是立即返回。</p>
<p><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io2.jpg" alt=""></p>
<h2 id="3-I-O多路复用"><a href="#3-I-O多路复用" class="headerlink" title="3.I/O多路复用"></a>3.I/O多路复用</h2><p>虽然I/O多路复用的函数也是阻塞的,但是其与以上两种还是有不同的,I/O多路复用是阻塞在select,epoll这样的系统调用之上,而没有阻塞在真正的I/O系统调用如recvfrom之上。<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io3.jpg" alt=""></p>
<h2 id="4-信号驱动式I-O"><a href="#4-信号驱动式I-O" class="headerlink" title="4.信号驱动式I/O"></a>4.信号驱动式I/O</h2><p>见得少,直接给出原图<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io4.jpg" alt=""></p>
<h2 id="5-异步I-O"><a href="#5-异步I-O" class="headerlink" title="5. 异步I/O"></a>5. 异步I/O</h2><p>这类函数的工作机制是告知内核启动某个操作,并让内核在整个操作(包括将数据从内核拷贝到用户空间)完成后通知我们。如图所示,注意红线标记处说明在调用时就可以立马返回,等函数操作完成会通知我们。<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io5.jpg" alt=""></p>
<h2 id="区别"><a href="#区别" class="headerlink" title="区别"></a>区别</h2><p>前四种I/O模型都是同步I/O操作,他们的区别在于第一阶段,而他们的第二阶段是一样的:在数据从内核复制到应用缓冲区期间(用户空间),进程阻塞于recvfrom调用。相反,异步I/O模型在这两个阶段都要处理。书上也有一副很好的图解释了<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io6.jpg" alt=""></p>
<ul>
<li>同步I/O操作:导致请求进程阻塞,直到I/O操作完成</li>
<li>异步I/O操作:不导致请求进程阻塞</li>
</ul>
<p>阻塞,非阻塞:进程/线程要访问的数据是否就绪,进程/线程是否需要等待;</p>
<p>同步,异步:访问数据的方式,同步需要主动读写数据,在读写数据的过程中还是会阻塞;异步只需要I/O操作完成的通知,并不主动读写数据,由操作系统内核完成数据的读写。</p>
<p>这里有个来自知乎的例子:<br>老张爱喝茶,废话不说,煮开水。出场人物:老张,水壶两把(普通水壶,简称水壶;会响的水壶,简称响水壶)。</p>
<ol>
<li>老张把水壶放到火上,立等水开。(同步阻塞)老张觉得自己有点傻</li>
<li>老张把水壶放到火上,去客厅看电视,时不时去厨房看看水开没有。(同步非阻塞)老张还是觉得自己有点傻,于是变高端了,买了把会响笛的那种水壶。水开之后,能大声发出嘀~~~~的噪音。</li>
<li>老张把响水壶放到火上,立等水开。(异步阻塞)老张觉得这样傻等意义不大</li>
<li>老张把响水壶放到火上,去客厅看电视,水壶响之前不再去看它了,响了再去拿壶。(异步非阻塞)</li>
</ol>
<p>老张觉得自己聪明了。所谓同步异步,只是对于水壶而言。普通水壶,同步;响水壶,异步。虽然都能干活,但响水壶可以在自己完工之后,提示老张水开了。这是普通水壶所不能及的。同步只能让调用者去轮询自己(情况2中),造成老张效率的低下。所谓阻塞非阻塞,仅仅对于老张而言。立等的老张(被挂起),阻塞;看电视的老张,非阻塞。情况1和情况3中老张就是阻塞的,媳妇喊他都不知道。虽然3中响水壶是异步的,可对于立等的老张没有太大的意义。所以一般异步是配合非阻塞使用的,这样才能发挥异步的效用。</p>
<h1 id="二-IO复用"><a href="#二-IO复用" class="headerlink" title="二. IO复用"></a>二. IO复用</h1><p>这里一开始可能看不出来IO复用比之前阻塞IO有哪些好处,这里也给出一个在知乎上看的解释:</p>
<p>假设你是一个机场的空管, 你需要管理到你机场的所有的航线, 包括进港,出港, 有些航班需要放到停机坪等待,有些航班需要去登机口接乘客。<br>你会怎么做?<br>最简单的做法,就是你去招一大批空管员,然后每人盯一架飞机, 从进港,接客,排位,出港,航线监控,直至交接给下一个空港,全程监控。<br>那么问题就来了:<br>很快你就发现空管塔里面聚集起来一大票的空管员,交通稍微繁忙一点,新的空管员就已经挤不进来了。 空管员之间需要协调,屋子里面就1, 2个人的时候还好,几十号人以后 ,基本上就成菜市场了。空管员经常需要更新一些公用的东西,比如起飞显示屏,比如下一个小时后的出港排期,最后你会很惊奇的发现,每个人的时间最后都花在了抢这些资源上。 现实上我们的空管同时管几十架飞机稀松平常的事情, 他们怎么做的呢?<br>他们用这个东西<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io7.jpg" alt=""></p>
<p>这个东西叫flight progress strip. 每一个块代表一个航班,不同的槽代表不同的状态,然后一个空管员可以管理一组这样的块(一组航班),而他的工作,就是在航班信息有新的更新的时候,把对应的块放到不同的槽子里面。这个东西现在还没有淘汰哦,只是变成电子的了而已。。是不是觉得一下子效率高了很多,一个空管塔里可以调度的航线可以是前一种方法的几倍到几十倍。<br>如果你把每一个航线当成一个Sock(I/O 流), 空管当成你的服务端Sock管理代码的话.第一种方法就是最传统的多进程并发模型 (每进来一个新的I/O流会分配一个新的进程管理。)第二种方法就是I/O多路复用 (单个线程,通过记录跟踪每个I/O流(sock)的状态,来同时管理多个I/O流 。)其实“I/O多路复用”这个坑爹翻译可能是这个概念在中文里面如此难理解的原因。所谓的I/O多路复用在英文中其实叫 I/O multiplexing. 如果你搜索multiplexing啥意思,基本上都会出这个图:<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io8.jpg" alt=""><br>于是大部分人都直接联想到”一根网线,多个sock复用” 这个概念,包括上面的几个回答, 其实不管你用多进程还是I/O多路复用, 网线都只有一根好伐。多个Sock复用一根网线这个功能是在内核+驱动层实现的。<br>重要的事情再说一遍: I/O multiplexing 这里面的 multiplexing 指的其实是在单个线程通过记录跟踪每一个Sock(I/O流)的状态(对应空管塔里面的Fight progress strip槽)来同时管理多个I/O流. 发明它的原因,是尽量多的提高服务器的吞吐能力。 是不是听起来好拗口,看个图就懂了<br><img src="https://raw.githubusercontent.com/MoriatyC/MoriatyC.github.io/master/images/io9.jpg" alt=""><br>在同一个线程里面, 通过拨开关的方式,来同时传输多个I/O流。</p>
<ul>
<li>顺便提一句: </li>
</ul>
<ol>
<li>select和poll都只返回可用集合,线程不安全,区别是select有1024个连接的限制</li>
<li>epoll 现在是线程安全的,返回具体可用的连接,现在不仅告诉你sock组里面数据,还会告诉你具体哪个sock有数据,你不用自己去找了。 </li>
</ol>
<h1 id="引用"><a href="#引用" class="headerlink" title="引用"></a>引用</h1><blockquote>
<p><a href="https://www.zhihu.com/question/19732473" target="_blank" rel="noopener">https://www.zhihu.com/question/19732473</a><br>UNIX网络编程(卷一)<br><a href="https://www.zhihu.com/question/32163005" target="_blank" rel="noopener">https://www.zhihu.com/question/32163005</a></p>
</blockquote>
</div>
<footer class="post-footer">
<div class="post-eof"></div>
</footer>
</div>
</article>
<article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://yoursite.com/2018/03/06/类加载机制/">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="name" content="MoriatyC">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/avatar.gif">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Be Better">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
<a class="post-title-link" href="/2018/03/06/类加载机制/" itemprop="url">类加载机制</a></h1>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2018-03-06T21:46:41+08:00">
2018-03-06
</time>
</span>
<span class="post-category" >
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/jvm/" itemprop="url" rel="index">
<span itemprop="name">jvm</span>
</a>
</span>
</span>
<span id="/2018/03/06/类加载机制/" class="leancloud_visitors" data-flag-title="类加载机制">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-eye"></i>