Flink DataStream分流、合流

Flink 专栏收录该内容
29 篇文章 20 订阅

本文总结Flink中非常有用的功能,分流和合流。

分流(Split/Side)

分流可以将一个流拆分成多个流。

基于Split...Select...

package com.bigdata.flink;

import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.ArrayList;

/**
 * Author: Wang Pei
 * Summary:
 *  分流:基于Split-Select
 */
@Slf4j
public class SplitStreamBySplit {
    public static void main(String[] args) throws Exception {

        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** Sample input: (productID, eventType, userID) */
        DataStreamSource<Tuple3<String, String, String>> events = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1"),
                new Tuple3<>("productID1", "click", "user_2"),
                new Tuple3<>("productID1", "browse", "user_1"),
                new Tuple3<>("productID2", "browse", "user_1"),
                new Tuple3<>("productID2", "click", "user_2"),
                new Tuple3<>("productID2", "click", "user_1")
        );

        /**
         * 1. Tag each record with the name of the sub-stream it belongs to.
         * split() only marks records — the stream is not physically divided here.
         * OutputSelector is a functional interface, so a lambda is sufficient.
         */
        SplitStream<Tuple3<String, String, String>> tagged = events.split(record -> {
            ArrayList<String> tags = new ArrayList<>();
            String productId = record.f0;
            if (productId.equals("productID1")) {
                tags.add("productID1");
            } else if (productId.equals("productID2")) {
                tags.add("productID2");
            }
            return tags;
        });

        /** 2. Materialize one tagged sub-stream via select(). */
        tagged.select("productID1").print();

        env.execute();
    }
}

注意:

  1. Split...Select...中,Split只是对流中的数据打上标记,并没有将流真正拆分。可通过Select算子将流真正拆分出来。
  2. Split...Select...不能连续分流。即不能Split...Select...Split,但可以如Split...Select...Filter...Split
  3. Split...Select...已经过时,推荐使用更灵活的侧路输出(Side-Output),如下。

基于Side-Output

package com.bigdata.flink;

import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;


/**
 * Author: Wang Pei
 * Summary:
 *  分流:基于SideOutput(侧路输出)
 */
@Slf4j
public class SplitStreamBySideOutput {
    public static void main(String[] args) throws Exception {

        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** Sample input: (productID, eventType, userID) */
        DataStreamSource<Tuple3<String, String, String>> events = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1"),
                new Tuple3<>("productID1", "click", "user_2"),
                new Tuple3<>("productID1", "browse", "user_1"),
                new Tuple3<>("productID2", "browse", "user_1"),
                new Tuple3<>("productID2", "click", "user_2"),
                new Tuple3<>("productID2", "click", "user_1")
        );

        /**
         * 1. The OutputTag identifies the side output. The anonymous subclass
         * braces ({}) are required so Flink can capture the generic type.
         */
        OutputTag<Tuple3<String, String, String>> productId1Tag =
                new OutputTag<Tuple3<String, String, String>>("side-output-tag"){};

        /**
         * 2. Route records inside a ProcessFunction: productID1 records go to
         * the side output; everything else stays on the main stream.
         */
        SingleOutputStreamOperator<Tuple3<String, String, String>> routed =
                events.process(new ProcessFunction<Tuple3<String, String, String>, Tuple3<String, String, String>>() {
                    @Override
                    public void processElement(Tuple3<String, String, String> record, Context ctx, Collector<Tuple3<String, String, String>> out) throws Exception {
                        if (!record.f0.equals("productID1")) {
                            // Main stream
                            out.collect(record);
                        } else {
                            // Side stream: only productID1 records
                            ctx.output(productId1Tag, record);
                        }
                    }
                });

        // Print the main stream
        routed.print();
        // Print the side stream
        routed.getSideOutput(productId1Tag).print();

        env.execute();
    }
}

注意:

  1. Side-Output是从Flink 1.3.0开始提供的功能,支持了更灵活的多路输出。
  2. Side-Output可以以侧流的形式,以不同于主流的数据类型,向下游输出指定条件的数据、异常数据、迟到数据等等。
  3. Side-Output通过ProcessFunction将数据发送到侧路OutputTag。

合流(Union/Connect)

合流可以将多个流合并成一个流。

基于Union

package com.bigdata.flink;

import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Author: Wang Pei
 * Summary:
 *  合流:基于Union
 */
@Slf4j
public class UnionStreamByUnion {
    public static void main(String[] args) throws Exception {

        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** First source: (productID, eventType, userID) */
        DataStreamSource<Tuple3<String, String, String>> clicks = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1")
        );

        /** Second source: same element type */
        DataStreamSource<Tuple3<String, String, String>> moreClicks = env.fromElements(
                new Tuple3<>("productID3", "click", "user_1"),
                new Tuple3<>("productID3", "click", "user_2")
        );

        /** Third source: same element type */
        DataStreamSource<Tuple3<String, String, String>> mixedEvents = env.fromElements(
                new Tuple3<>("productID2", "browse", "user_1"),
                new Tuple3<>("productID2", "click", "user_2"),
                new Tuple3<>("productID2", "click", "user_1")
        );

        /** Merge all three streams (types must match) and print the result. */
        clicks.union(moreClicks, mixedEvents).print();

        env.execute();
    }
}

注意:

  1. Union可以将两个或多个同数据类型的流合并成一个流。

基于Connect

package com.bigdata.flink;

import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;

/**
 * Author: Wang Pei
 * Summary:
 *  合流:基于Connect
 */
@Slf4j
public class UnionStreamByConnect {
    public static void main(String[] args) throws Exception {

        /** Execution environment */
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /** First source: tuples of (productID, eventType, userID) */
        DataStreamSource<Tuple3<String, String, String>> tupleSource = env.fromElements(
                new Tuple3<>("productID1", "click", "user_1")
        );

        /** Second source: colon-delimited strings "productID:eventType:userID" */
        DataStreamSource<String> stringSource = env.fromElements(
                "productID3:click:user_1",
                "productID3:browse:user_2"
        );

        /**
         * 1. connect() joins two streams of DIFFERENT types into one
         * ConnectedStreams; each side keeps its own element type.
         */
        ConnectedStreams<Tuple3<String, String, String>, String> connected = tupleSource.connect(stringSource);

        /**
         * 2. A CoMapFunction maps each side to a common output type:
         * (eventType, userID).
         */
        SingleOutputStreamOperator<Tuple2<String, String>> unified =
                connected.map(new CoMapFunction<Tuple3<String, String, String>, String, Tuple2<String, String>>() {

                    // Handles elements from the first (tuple) stream.
                    @Override
                    public Tuple2<String, String> map1(Tuple3<String, String, String> tuple) throws Exception {
                        return new Tuple2<>(tuple.f1, tuple.f2);
                    }

                    // Handles elements from the second (string) stream.
                    @Override
                    public Tuple2<String, String> map2(String line) throws Exception {
                        String[] parts = line.split(":");
                        return new Tuple2<>(parts[1], parts[2]);
                    }
                });

        unified.print();

        env.execute();
    }
}

注意:

  1. Connect可以用来合并两种不同类型的流。
  2. Connect合并后,可用map中的CoMapFunction或flatMap中的CoFlatMapFunction来对合并流中的每个流进行处理。
  • 4
    点赞
  • 0
    评论
  • 18
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

相关推荐
©️2020 CSDN 皮肤主题: 精致技术 设计师:CSDN官方博客 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值