玩转 StarRocks on ES 全文检索

发布时间:2025-05-19 00:34

目录

部署ES

下载

Config

系统参数

启动

Verify

SR 外表

测试2:全文检索

插件

ES

SR

部署ES

下载

wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.16.2-linux-x86_64.tar.gz

Config

1

2

3

4

sr@cs02:~/app/elasticsearch-7.16.2$grep -v ^# config/elasticsearch.yml

node.name: node-2

network.host: 172.26.194.185

cluster.initial_master_nodes: ["node-2"]

系统参数

1

vm.max_map_count = 655360

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

sr@cs02:~/app/elasticsearch-7.16.2$sudo vim /etc/sysctl.conf

sr@cs02:~/app/elasticsearch-7.16.2$sudo sysctl -p

vm.swappiness = 0

kernel.sysrq = 1

net.ipv4.neigh.default.gc_stale_time = 120

net.ipv4.conf.all.rp_filter = 0

net.ipv4.conf.default.rp_filter = 0

net.ipv4.conf.default.arp_announce = 2

net.ipv4.conf.lo.arp_announce = 2

net.ipv4.conf.all.arp_announce = 2

net.ipv4.tcp_max_tw_buckets = 5000

net.ipv4.tcp_syncookies = 1

net.ipv4.tcp_max_syn_backlog = 1024

net.ipv4.tcp_synack_retries = 2

vm.max_map_count = 655360

启动

1

sr@cs02:~/app/elasticsearch-7.16.2$bin/elasticsearch -d

Verify

1

2

3

sr@cs02:~/app/elasticsearch-7.16.2$sudo netstat -lnpt |grep 9[2-3]00

tcp6       0      0 172.26.194.185:9200     :::*                    LISTEN      10442/java

tcp6       0      0 172.26.194.185:9300     :::*                    LISTEN      10442/java

SR 外表

测试2:全文检索

插件

1

2

3

wget https:

-- 解压到 es/plugin

-- 重启es

ES

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

curl -sH "Content-Type: application/json" -XPUT "cs02:9200/wa"  -d'

{

  "mappings": {

      "properties": {

        "allcol":{

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart",

          "type": "text"

        },

        "name": {

          "type": "text",

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart",         

           "copy_to" : [         

           "allcol"         

           ]

        },

        "site": {

          "type": "text",

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart",         

           "copy_to" : [         

           "allcol"         

           ]

        },

        "comment": {

          "type": "text",

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart",         

           "copy_to" : [         

           "allcol"         

           ]

        }

    }

  },

  "settings": {

    "index": {

      "number_of_replicas": "0",

      "number_of_shards": "1"

    }

  }

}

'

Post

1

2

3

4

5

6

7

8

9

10

curl -XPOST "http://cs02:9200/_bulk" -H 'Content-Type: application/json' -d'

{"index":{"_index":"wa"}}

{ "name": "老谢", "site": "www.starrocks.com","comment": "StarRocks极速MPP数据库"}

{"index":{"_index":"wa"}}

{ "name": "Simon", "site": "www.baidu.com","comment": "Simon accessed www.baidu.com"}

{"index":{"_index":"wa"}}

{ "name": "张三", "site": "www.google.com","comment": "今天天气不错"}

{"index":{"_index":"wa"}}

{ "name": "李四", "site": "docs.starrocks.com","comment": "李四比较懒,啥也没说"}

'

Search

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

sr@cs01:~$curl -s -XGET cs02:9200/wa/_search?pretty

{

  "took" : 818,

  "timed_out" : false,

  "_shards" : {

    "total" : 1,

    "successful" : 1,

    "skipped" : 0,

    "failed" : 0

  },

  "hits" : {

    "total" : {

      "value" : 4,

      "relation" : "eq"

    },

    "max_score" : 1.0,

    "hits" : [

      {

        "_index" : "wa",

        "_type" : "_doc",

        "_id" : "w27USX4B8p-MWLQe7Rj7",

        "_score" : 1.0,

        "_source" : {

          "name" : "老谢",

          "site" : "www.starrocks.com",

          "comment" : "StarRocks极速MPP数据库"

        }

      },

      {

        "_index" : "wa",

        "_type" : "_doc",

        "_id" : "xG7USX4B8p-MWLQe7Rj7",

        "_score" : 1.0,

        "_source" : {

          "name" : "Simon",

          "site" : "www.baidu.com",

          "comment" : "Simon accessed www.baidu.com"

        }

      },

      {

        "_index" : "wa",

        "_type" : "_doc",

        "_id" : "xW7USX4B8p-MWLQe7Rj7",

        "_score" : 1.0,

        "_source" : {

          "name" : "张三",

          "site" : "www.google.com",

          "comment" : "今天天气不错"

        }

      },

      {

        "_index" : "wa",

        "_type" : "_doc",

        "_id" : "xm7USX4B8p-MWLQe7Rj7",

        "_score" : 1.0,

        "_source" : {

          "name" : "李四",

          "site" : "docs.starrocks.com",

          "comment" : "李四比较懒,啥也没说"

        }

      }

    ]

  }

}

curl -sH "Content-Type: application/json" -XGET cs02:9200/wa/_search?pretty -d '

{

      "query":{

          "term":{

          "allcol":

              "比较" }

          }

 }'

sr@cs01:~/soft$curl -sH "Content-Type: application/json" -XGET cs02:9200/wa/_search?pretty -d '

> {

>       "query":{

>           "term":{

>           "allcol":

>               "比较" }

>           }

>  }'

{

  "took" : 3,

  "timed_out" : false,

  "_shards" : {

    "total" : 1,

    "successful" : 1,

    "skipped" : 0,

    "failed" : 0

  },

  "hits" : {

    "total" : {

      "value" : 1,

      "relation" : "eq"

    },

    "max_score" : 1.1429836,

    "hits" : [

      {

        "_index" : "wa",

        "_type" : "_doc",

        "_id" : "ImUCSn4BHW9CsLnJXeFp",

        "_score" : 1.1429836,

        "_source" : {

          "name" : "李四",

          "site" : "docs.starrocks.com",

          "comment" : "李四比较懒,啥也没说"

        }

      }

    ]

  }

}

SR

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

USE simon;

-- 全文检索

CREATE EXTERNAL TABLE `soe_t2` (

 `allcol` varchar(65530) NULL COMMENT "全文检索字段",

 `name` varchar(65530) NULL COMMENT "",

 `site` varchar(65530) NULL COMMENT "",

 `comment` varchar(65530) NULL COMMENT ""

) ENGINE=ELASTICSEARCH

COMMENT "ELASTICSEARCH"

PROPERTIES (

"hosts" = "cs02:9200",

"index" = "wa",

"type" = "_doc",

"transport" = "http",

"enable_docvalue_scan" = "true",

"max_docvalue_fields" = "20",

"enable_keyword_sniff" = "true"

);

1

2

3

4

5

6

7

8

9

10

mysql> select * from soe_t2;

+--------+--------+--------------------+--------------------------------+

| allcol | name   | site               | comment                        |

+--------+--------+--------------------+--------------------------------+

| NULL   | 老谢   | www.starrocks.com  | StarRocks极速MPP数据库            |

| NULL   | Simon  | www.baidu.com      | Simon accessed www.baidu.com   |

| NULL   | 张三   | www.google.com     | 今天天气不错                   |

| NULL   | 李四   | docs.starrocks.com | 李四比较懒,啥也没说           |

+--------+--------+--------------------+--------------------------------+

4 rows in set (0.01 sec)

NOTE: 如果用standard分词器,英文的可以查任意字段,但是中文不行

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

-- www未按预期分词

mysql> select * from soe_t2 where allcol = 'www';

Empty set (0.01 sec)

-- 标准分词器,中文不大行

mysql> select * from soe_t2 where allcol = '张三';

Empty set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'simon';

+--------+-------+---------------+------------------------------+

| allcol | name  | site          | comment                      |

+--------+-------+---------------+------------------------------+

| NULL   | Simon | www.baidu.com | Simon accessed www.baidu.com |

+--------+-------+---------------+------------------------------+

1 row in set (0.33 sec)

mysql> select * from soe_t2 where allcol = 'baidu';

Empty set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'accessed';

+--------+-------+---------------+------------------------------+

| allcol | name  | site          | comment                      |

+--------+-------+---------------+------------------------------+

| NULL   | Simon | www.baidu.com | Simon accessed www.baidu.com |

+--------+-------+---------------+------------------------------+

1 row in set (0.01 sec)

-- docs.starrocks.com, 没有按预期的分词出 docs

mysql> select * from soe_t2 where allcol = 'docs';

Empty set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'docs.starrocks.com';

+--------+--------+--------------------+--------------------------------+

| allcol | name   | site               | comment                        |

+--------+--------+--------------------+--------------------------------+

| NULL   | 李四   | docs.starrocks.com | 李四比较懒,啥也没说           |

+--------+--------+--------------------+--------------------------------+

1 row in set (0.01 sec)

使用IK分词器,SR on ES利用copy_to allcol字段,实现全文检索效果!

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

mysql> select * from soe_t2 where allcol = 'docs';

+--------+--------+--------------------+--------------------------------+

| allcol | name   | site               | comment                        |

+--------+--------+--------------------+--------------------------------+

| NULL   | 李四   | docs.starrocks.com | 李四比较懒,啥也没说           |

+--------+--------+--------------------+--------------------------------+

1 row in set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'baidu';

+--------+-------+---------------+------------------------------+

| allcol | name  | site          | comment                      |

+--------+-------+---------------+------------------------------+

| NULL   | Simon | www.baidu.com | Simon accessed www.baidu.com |

+--------+-------+---------------+------------------------------+

1 row in set (0.02 sec)

mysql> select * from soe_t2 where allcol = '张三';

+--------+--------+----------------+--------------------+

| allcol | name   | site           | comment            |

+--------+--------+----------------+--------------------+

| NULL   | 张三   | www.google.com | 今天天气不错       |

+--------+--------+----------------+--------------------+

1 row in set (0.01 sec)

mysql> select * from soe_t2 where allcol = 'www';

+--------+--------+-------------------+------------------------------+

| allcol | name   | site              | comment                      |

+--------+--------+-------------------+------------------------------+

| NULL   | 老谢   | www.starrocks.com | StarRocks极速MPP数据库          |

| NULL   | Simon  | www.baidu.com     | Simon accessed www.baidu.com |

| NULL   | 张三   | www.google.com    | 今天天气不错                 |

+--------+--------+-------------------+------------------------------+

3 rows in set (0.75 sec)

上一篇:

玩转StarRocks on ES-1-分词

网址:玩转 StarRocks on ES 全文检索 https://m.mxgxt.com/news/view/1271941

相关内容

StarRocks Summit 2023 技术交流峰会圆满落幕
StarRocks 相关面试题
滴滴 x StarRocks:极速多维分析创造更大的业务价值
实时计算 Flink版产品使用问题之如何对starrocks进行分桶
EMR Serverless StarRocks 版
StarRocks 1.19 新版本特性介绍
StarRocks数据同步工具
StarRocks Summit Asia 2024落幕,Lakehouse引领数据技术新趋势
StarRocks 2024 数据技术峰会圆满收官,Lakehouse引领数据技术新趋势
再见,ClickHouse!2022我选StarRocks 2.0!

随便看看