玩转 StarRocks on ES 全文检索
目录
部署ES
下载
Config
系统参数
启动
Verify
SR 外表
测试2:全文检索
插件
ES
SR
部署ES
下载
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.16.2-linux-x86_64.tar.gz
Config
1
2
3
4
sr@cs02:~/app/elasticsearch-7.16.2$grep -v ^# config/elasticsearch.yml
node.name: node-2
network.host: 172.26.194.185
cluster.initial_master_nodes: ["node-2"]
系统参数
1
vm.max_map_count = 655360
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
sr@cs02:~/app/elasticsearch-7.16.2$sudo vim /etc/sysctl.conf
sr@cs02:~/app/elasticsearch-7.16.2$sudo sysctl -p
vm.swappiness = 0
kernel.sysrq = 1
net.ipv4.neigh.default.gc_stale_time = 120
net.ipv4.conf.all.rp_filter = 0
net.ipv4.conf.default.rp_filter = 0
net.ipv4.conf.default.arp_announce = 2
net.ipv4.conf.lo.arp_announce = 2
net.ipv4.conf.all.arp_announce = 2
net.ipv4.tcp_max_tw_buckets = 5000
net.ipv4.tcp_syncookies = 1
net.ipv4.tcp_max_syn_backlog = 1024
net.ipv4.tcp_synack_retries = 2
vm.max_map_count = 655360
启动
1
sr@cs02:~/app/elasticsearch-7.16.2$bin/elasticsearch -d
Verify
1
2
3
sr@cs02:~/app/elasticsearch-7.16.2$sudo netstat -lnpt |grep 9[2-3]00
tcp6 0 0 172.26.194.185:9200 :::* LISTEN 10442/java
tcp6 0 0 172.26.194.185:9300 :::* LISTEN 10442/java
SR 外表
测试2:全文检索
插件
1
2
3
wget https:
-- 解压到 es/plugin
-- 重启es
ES
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
curl -sH "Content-Type: application/json" -XPUT "cs02:9200/wa" -d'
{
"mappings": {
"properties": {
"allcol":{
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"type": "text"
},
"name": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"copy_to" : [
"allcol"
]
},
"site": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"copy_to" : [
"allcol"
]
},
"comment": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"copy_to" : [
"allcol"
]
}
}
},
"settings": {
"index": {
"number_of_replicas": "0",
"number_of_shards": "1"
}
}
}
'
Post
1
2
3
4
5
6
7
8
9
10
curl -XPOST "http://cs02:9200/_bulk" -H 'Content-Type: application/json' -d'
{"index":{"_index":"wa"}}
{ "name": "老谢", "site": "www.starrocks.com","comment": "StarRocks极速MPP数据库"}
{"index":{"_index":"wa"}}
{ "name": "Simon", "site": "www.baidu.com","comment": "Simon accessed www.baidu.com"}
{"index":{"_index":"wa"}}
{ "name": "张三", "site": "www.google.com","comment": "今天天气不错"}
{"index":{"_index":"wa"}}
{ "name": "李四", "site": "docs.starrocks.com","comment": "李四比较懒,啥也没说"}
'
Search
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
sr@cs01:~$curl -s -XGET cs02:9200/wa/_search?pretty
{
"took" : 818,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "wa",
"_type" : "_doc",
"_id" : "w27USX4B8p-MWLQe7Rj7",
"_score" : 1.0,
"_source" : {
"name" : "老谢",
"site" : "www.starrocks.com",
"comment" : "StarRocks极速MPP数据库"
}
},
{
"_index" : "wa",
"_type" : "_doc",
"_id" : "xG7USX4B8p-MWLQe7Rj7",
"_score" : 1.0,
"_source" : {
"name" : "Simon",
"site" : "www.baidu.com",
"comment" : "Simon accessed www.baidu.com"
}
},
{
"_index" : "wa",
"_type" : "_doc",
"_id" : "xW7USX4B8p-MWLQe7Rj7",
"_score" : 1.0,
"_source" : {
"name" : "张三",
"site" : "www.google.com",
"comment" : "今天天气不错"
}
},
{
"_index" : "wa",
"_type" : "_doc",
"_id" : "xm7USX4B8p-MWLQe7Rj7",
"_score" : 1.0,
"_source" : {
"name" : "李四",
"site" : "docs.starrocks.com",
"comment" : "李四比较懒,啥也没说"
}
}
]
}
}
curl -sH "Content-Type: application/json" -XGET cs02:9200/wa/_search?pretty -d '
{
"query":{
"term":{
"allcol":
"比较" }
}
}'
sr@cs01:~/soft$curl -sH "Content-Type: application/json" -XGET cs02:9200/wa/_search?pretty -d '
> {
> "query":{
> "term":{
> "allcol":
> "比较" }
> }
> }'
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.1429836,
"hits" : [
{
"_index" : "wa",
"_type" : "_doc",
"_id" : "ImUCSn4BHW9CsLnJXeFp",
"_score" : 1.1429836,
"_source" : {
"name" : "李四",
"site" : "docs.starrocks.com",
"comment" : "李四比较懒,啥也没说"
}
}
]
}
}
SR
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
USE simon;
-- 全文检索
CREATE EXTERNAL TABLE `soe_t2` (
`allcol` varchar(65530) NULL COMMENT "全文检索字段",
`name` varchar(65530) NULL COMMENT "",
`site` varchar(65530) NULL COMMENT "",
`comment` varchar(65530) NULL COMMENT ""
) ENGINE=ELASTICSEARCH
COMMENT "ELASTICSEARCH"
PROPERTIES (
"hosts" = "cs02:9200",
"index" = "wa",
"type" = "_doc",
"transport" = "http",
"enable_docvalue_scan" = "true",
"max_docvalue_fields" = "20",
"enable_keyword_sniff" = "true"
);
1
2
3
4
5
6
7
8
9
10
mysql> select * from soe_t2;
+--------+--------+--------------------+--------------------------------+
| allcol | name | site | comment |
+--------+--------+--------------------+--------------------------------+
| NULL | 老谢 | www.starrocks.com | StarRocks极速MPP数据库 |
| NULL | Simon | www.baidu.com | Simon accessed www.baidu.com |
| NULL | 张三 | www.google.com | 今天天气不错 |
| NULL | 李四 | docs.starrocks.com | 李四比较懒,啥也没说 |
+--------+--------+--------------------+--------------------------------+
4 rows in set (0.01 sec)
NOTE: 如果用standard分词器,英文的可以查任意字段,但是中文不行
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
-- www未按预期分词
mysql> select * from soe_t2 where allcol = 'www';
Empty set (0.01 sec)
-- 标准分词器,中文不大行
mysql> select * from soe_t2 where allcol = '张三';
Empty set (0.01 sec)
mysql> select * from soe_t2 where allcol = 'simon';
+--------+-------+---------------+------------------------------+
| allcol | name | site | comment |
+--------+-------+---------------+------------------------------+
| NULL | Simon | www.baidu.com | Simon accessed www.baidu.com |
+--------+-------+---------------+------------------------------+
1 row in set (0.33 sec)
mysql> select * from soe_t2 where allcol = 'baidu';
Empty set (0.01 sec)
mysql> select * from soe_t2 where allcol = 'accessed';
+--------+-------+---------------+------------------------------+
| allcol | name | site | comment |
+--------+-------+---------------+------------------------------+
| NULL | Simon | www.baidu.com | Simon accessed www.baidu.com |
+--------+-------+---------------+------------------------------+
1 row in set (0.01 sec)
-- docs.starrocks.com, 没有按预期的分词出 docs
mysql> select * from soe_t2 where allcol = 'docs';
Empty set (0.01 sec)
mysql> select * from soe_t2 where allcol = 'docs.starrocks.com';
+--------+--------+--------------------+--------------------------------+
| allcol | name | site | comment |
+--------+--------+--------------------+--------------------------------+
| NULL | 李四 | docs.starrocks.com | 李四比较懒,啥也没说 |
+--------+--------+--------------------+--------------------------------+
1 row in set (0.01 sec)
使用IK分词器,SR on ES利用copy_to allcol字段,实现全文检索效果!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
mysql> select * from soe_t2 where allcol = 'docs';
+--------+--------+--------------------+--------------------------------+
| allcol | name | site | comment |
+--------+--------+--------------------+--------------------------------+
| NULL | 李四 | docs.starrocks.com | 李四比较懒,啥也没说 |
+--------+--------+--------------------+--------------------------------+
1 row in set (0.01 sec)
mysql> select * from soe_t2 where allcol = 'baidu';
+--------+-------+---------------+------------------------------+
| allcol | name | site | comment |
+--------+-------+---------------+------------------------------+
| NULL | Simon | www.baidu.com | Simon accessed www.baidu.com |
+--------+-------+---------------+------------------------------+
1 row in set (0.02 sec)
mysql> select * from soe_t2 where allcol = '张三';
+--------+--------+----------------+--------------------+
| allcol | name | site | comment |
+--------+--------+----------------+--------------------+
| NULL | 张三 | www.google.com | 今天天气不错 |
+--------+--------+----------------+--------------------+
1 row in set (0.01 sec)
mysql> select * from soe_t2 where allcol = 'www';
+--------+--------+-------------------+------------------------------+
| allcol | name | site | comment |
+--------+--------+-------------------+------------------------------+
| NULL | 老谢 | www.starrocks.com | StarRocks极速MPP数据库 |
| NULL | Simon | www.baidu.com | Simon accessed www.baidu.com |
| NULL | 张三 | www.google.com | 今天天气不错 |
+--------+--------+-------------------+------------------------------+
3 rows in set (0.75 sec)
上一篇:
玩转StarRocks on ES-1-分词
网址:玩转 StarRocks on ES 全文检索 https://m.mxgxt.com/news/view/1271941
相关内容
StarRocks Summit 2023 技术交流峰会圆满落幕StarRocks 相关面试题
滴滴 x StarRocks:极速多维分析创造更大的业务价值
实时计算 Flink版产品使用问题之如何对starrocks进行分桶
EMR Serverless StarRocks 版
StarRocks 1.19 新版本特性介绍
StarRocks数据同步工具
StarRocks Summit Asia 2024落幕,Lakehouse引领数据技术新趋势
StarRocks 2024 数据技术峰会圆满收官,Lakehouse引领数据技术新趋势
再见,ClickHouse!2022我选StarRocks 2.0!