Flink Table & SQL Dual-Stream Joins

This article summarizes the dual-stream joins available in Flink Table & SQL:

  1. Regular Join

  2. Interval Join

  3. Window Join

Regular Join

Regular Join: the ordinary, unbounded join of two streams.

Note the following when using it:

  1. By default, the entire input of both streams is kept in state. To keep state from growing without bound, configure an idle-state retention time via the Query Configuration (see the sketch after this list).

  2. Currently only equi-joins are supported.

  3. Outer joins (LEFT JOIN/RIGHT JOIN/FULL JOIN) produce ADD/RETRACT messages, i.e. a retract stream of Tuple2<Boolean, Row> in which false marks a retraction; keep this in mind when emitting results.
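
A minimal sketch of the retention setting, using the Flink 1.10-era TableConfig API that the code example below is written against; the 12h/24h values are illustrative:

import org.apache.flink.api.common.time.Time;

// After creating tableEnv as in the code example below.
// State for a key that has been idle longer than the maximum retention time is
// cleaned up; it is kept for at least the minimum time (the maximum must exceed
// the minimum by at least 5 minutes).
tableEnv.getConfig().setIdleStateRetentionTime(Time.hours(12), Time.hours(24));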

Code example

package com.bigdata.flink.streamJoin.regularJoin;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * Author: Wang Pei
 * Summary:
 * Regular Join
 */
public class RegularJoin {
    public static void main(String[] args) throws Exception {

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);

        // Stream 1
        // Browse events: {"userID":"user_1","eventType":"browse","eventTime":"2015-01-01 00:00:00"}
        String kafkaBrowse = ""
                + "CREATE TABLE kafka_browse_log "
                + "( "
                + "    userID STRING, "
                + "    eventType STRING, "
                + "    eventTime STRING, "
                + "    proctime as PROCTIME() "
                + ") WITH ( "
                + "    'connector.type' = 'kafka', "
                + "    'connector.version' = '0.10', "
                + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
                + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
                + "    'connector.topic' = 'topic_1', "
                + "    'connector.properties.group.id' = 'c1', "
                + "    'connector.startup-mode' = 'latest-offset', "
                + "    'format.type' = 'json' "
                + ")";
        tableEnv.sqlUpdate(kafkaBrowse);
        tableEnv.toAppendStream(tableEnv.from("kafka_browse_log"), Row.class).print();

        // Stream 2
        // User profile change events: {"userID":"user_1","userName":"name1","userAge":10,"userAddress":"Mars"}
        String kafkaUser = ""
                + "CREATE TABLE kafka_user_change_log "
                + "( "
                + "    userID STRING, "
                + "    userName STRING, "
                + "    userAge INT, "
                + "    userAddress STRING "
                + ") WITH ( "
                + "    'connector.type' = 'kafka', "
                + "    'connector.version' = '0.10', "
                + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
                + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
                + "    'connector.topic' = 'topic_2', "
                + "    'connector.properties.group.id' = 'c1', "
                + "    'connector.startup-mode' = 'latest-offset', "
                + "    'format.type' = 'json' "
                + ")";
        tableEnv.sqlUpdate(kafkaUser);
        tableEnv.toAppendStream(tableEnv.from("kafka_user_change_log"), Row.class).print();

        // INNER JOIN
        //String execSQL = ""
        //        + "SELECT * "
        //        + "FROM kafka_browse_log "
        //        + "INNER JOIN kafka_user_change_log "
        //        + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
        //tableEnv.toAppendStream(tableEnv.sqlQuery(execSQL), Row.class).print();

        // LEFT JOIN
        //String execSQL = ""
        //        + "SELECT * "
        //        + "FROM kafka_browse_log "
        //        + "LEFT JOIN kafka_user_change_log "
        //        + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
        //tableEnv.toRetractStream(tableEnv.sqlQuery(execSQL), Row.class).print();

        // RIGHT JOIN
        //String execSQL = ""
        //        + "SELECT * "
        //        + "FROM kafka_browse_log "
        //        + "RIGHT JOIN kafka_user_change_log "
        //        + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
        //tableEnv.toRetractStream(tableEnv.sqlQuery(execSQL), Row.class).print();

        // FULL JOIN
        String execSQL = ""
                + "SELECT * "
                + "FROM kafka_browse_log "
                + "FULL JOIN kafka_user_change_log "
                + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
        tableEnv.toRetractStream(tableEnv.sqlQuery(execSQL), Row.class).print();

        tableEnv.execute(RegularJoin.class.getSimpleName());
    }
}

Interval Join

Interval Join: a time-bounded join. Example scenario: one stream joins the other stream's data from a relative time range.

Note the following when using it:

  1. Both processing time and event time are supported (an event-time sketch follows this list).

  2. Records outside the time range are cleaned up automatically, which keeps state from growing too large.

  3. Non-equi conditions are supported, e.g. >=, >, <=, < and BETWEEN ... AND ...

  4. For joining data from a recent time range with the DataStream API, see "Flink DataStream: real-time join over a recent time range with Interval Join" (a minimal DataStream sketch also follows the code example below).
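
The code example below bounds the join on processing time. Here is a hedged sketch of the event-time variant, assuming the Blink planner's WATERMARK DDL, that the JSON timestamps can be parsed as TIMESTAMP(3), and a hypothetical eventTime column added to the user-change table (it has none in the example):

// Hypothetical DDL: eventTime becomes a rowtime attribute with a 5-second
// out-of-orderness bound. kafka_user_change_log_rowtime would need the same
// eventTime/WATERMARK declaration.
String kafkaBrowseRowtime = ""
        + "CREATE TABLE kafka_browse_log_rowtime "
        + "( "
        + "    userID STRING, "
        + "    eventType STRING, "
        + "    eventTime TIMESTAMP(3), "
        + "    WATERMARK FOR eventTime AS eventTime - INTERVAL '5' SECOND "
        + ") WITH ( "
        + "    'connector.type' = 'kafka', "
        + "    'connector.version' = '0.10', "
        + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
        + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
        + "    'connector.topic' = 'topic_1', "
        + "    'connector.properties.group.id' = 'c1', "
        + "    'connector.startup-mode' = 'latest-offset', "
        + "    'format.type' = 'json' "
        + ")";

// The join condition then bounds the two rowtime attributes instead of proctime.
String eventTimeJoin = ""
        + "SELECT * "
        + "FROM kafka_browse_log_rowtime browse, "
        + "     kafka_user_change_log_rowtime `user` "
        + "WHERE browse.userID = `user`.userID "
        + "    AND `user`.eventTime BETWEEN browse.eventTime - INTERVAL '30' SECOND AND browse.eventTime";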

Code example

package com.bigdata.flink.streamJoin.intervalJoin;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * Author: Wang Pei
 * Summary:
 * Interval Join
 */
public class IntervalJoin {
    public static void main(String[] args) throws Exception {

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);

        // Stream 1
        // Browse events: {"userID":"user_1","eventType":"browse","eventTime":"2015-01-01 00:00:00"}
        String kafkaBrowse = ""
                + "CREATE TABLE kafka_browse_log "
                + "( "
                + "    userID STRING, "
                + "    eventType STRING, "
                + "    eventTime STRING, "
                + "    proctime as PROCTIME() "
                + ") WITH ( "
                + "    'connector.type' = 'kafka', "
                + "    'connector.version' = '0.10', "
                + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
                + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
                + "    'connector.topic' = 'topic_1', "
                + "    'connector.properties.group.id' = 'c1', "
                + "    'connector.startup-mode' = 'latest-offset', "
                + "    'format.type' = 'json' "
                + ")";
        tableEnv.sqlUpdate(kafkaBrowse);
        tableEnv.toAppendStream(tableEnv.from("kafka_browse_log"), Row.class).print();

        // Stream 2
        // User profile change events: {"userID":"user_1","userName":"name1","userAge":10,"userAddress":"Mars"}
        String kafkaUser = ""
                + "CREATE TABLE kafka_user_change_log "
                + "( "
                + "    userID STRING, "
                + "    userName STRING, "
                + "    userAge INT, "
                + "    userAddress STRING, "
                + "    proctime as PROCTIME() "
                + ") WITH ( "
                + "    'connector.type' = 'kafka', "
                + "    'connector.version' = '0.10', "
                + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
                + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
                + "    'connector.topic' = 'topic_2', "
                + "    'connector.properties.group.id' = 'c1', "
                + "    'connector.startup-mode' = 'latest-offset', "
                + "    'format.type' = 'json' "
                + ")";
        tableEnv.sqlUpdate(kafkaUser);
        tableEnv.toAppendStream(tableEnv.from("kafka_user_change_log"), Row.class).print();

        // Interval Join
        String execSQL = ""
                + "SELECT * "
                + "FROM kafka_browse_log browse, "
                + "     kafka_user_change_log `user` "
                + "WHERE  "
                + "     browse.userID = `user`.userID "
                + "     AND `user`.proctime BETWEEN browse.proctime - INTERVAL '30' SECOND AND browse.proctime";
        tableEnv.toAppendStream(tableEnv.sqlQuery(execSQL), Row.class).print();

        tableEnv.execute(IntervalJoin.class.getSimpleName());
    }
}
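
For reference from note 4 above, a minimal DataStream sketch of the same 30-second bound. The (userID, payload) tuples and ascending timestamps are hypothetical stand-ins; the DataStream interval join works on event time only:

package com.bigdata.flink.streamJoin.intervalJoin;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

public class DataStreamIntervalJoinSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // The DataStream interval join requires event time
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Hypothetical inputs: (userID, payload); wall-clock millis stand in for real event times
        DataStream<Tuple2<String, String>> browse = env
                .fromElements(Tuple2.of("user_1", "browse"))
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, String>>() {
                    @Override
                    public long extractAscendingTimestamp(Tuple2<String, String> element) {
                        return System.currentTimeMillis();
                    }
                });
        DataStream<Tuple2<String, String>> userChange = env
                .fromElements(Tuple2.of("user_1", "name1"))
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, String>>() {
                    @Override
                    public long extractAscendingTimestamp(Tuple2<String, String> element) {
                        return System.currentTimeMillis();
                    }
                });

        browse.keyBy(t -> t.f0)
                .intervalJoin(userChange.keyBy(t -> t.f0))
                // each browse event joins user changes from the preceding 30 seconds
                .between(Time.seconds(-30), Time.seconds(0))
                .process(new ProcessJoinFunction<Tuple2<String, String>, Tuple2<String, String>, String>() {
                    @Override
                    public void processElement(Tuple2<String, String> left, Tuple2<String, String> right,
                                               Context ctx, Collector<String> out) {
                        out.collect(left + " -> " + right);
                    }
                })
                .print();

        env.execute("DataStreamIntervalJoinSketch");
    }
}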

Window Join

Window Join: joins the records of two streams that share the same key and fall into the same window.

Note the following when using it:

  1. I have not yet found a good SQL formulation of the DataStream window joins (Tumbling Window Join/Sliding Window Join/Session Window Join); a DataStream sketch follows the references below.

  2. For window joins in the Flink DataStream API, see:

    A. Flink DataStream Window Join: Inner Join, Left Join, Right Join

    B. Flink DataStream Window Join: implementation and examples
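
Until a SQL formulation exists, here is a minimal DataStream sketch of a tumbling window join. The (userID, payload) tuples are hypothetical; the same pattern works with sliding and session window assigners:

package com.bigdata.flink.streamJoin.windowJoin;

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class DataStreamWindowJoinSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Hypothetical inputs: (userID, payload)
        DataStream<Tuple2<String, String>> browse =
                env.fromElements(Tuple2.of("user_1", "browse"));
        DataStream<Tuple2<String, String>> userChange =
                env.fromElements(Tuple2.of("user_1", "name1"));

        // Inner join of same-key elements that fall into the same 10s tumbling window
        browse.join(userChange)
                .where(t -> t.f0)
                .equalTo(t -> t.f0)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .apply(new JoinFunction<Tuple2<String, String>, Tuple2<String, String>, String>() {
                    @Override
                    public String join(Tuple2<String, String> left, Tuple2<String, String> right) {
                        return left + " -> " + right;
                    }
                })
                .print();

        env.execute("DataStreamWindowJoinSketch");
    }
}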
