From d7476f60e581cdc44c2d5c0b55b4fc6e3e54df5c Mon Sep 17 00:00:00 2001 From: psainics Date: Wed, 9 Apr 2025 11:26:48 +0530 Subject: [PATCH 1/4] Make referenceName required --- src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java b/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java index f4a5a92..1454aab 100644 --- a/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java +++ b/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java @@ -59,8 +59,6 @@ public class FTPConfig extends PluginConfig implements FileSourceProperties { }.getType(); private static final List LOCATION_PROPERTIES = Arrays.asList("type", "host", "path", "user", "password"); - @Macro - @Nullable @Description("Name be used to uniquely identify this source for lineage, annotating metadata, etc.") private final String referenceName; From 2eeaf73a4d375d60bee51fe966b6916b58ea5033 Mon Sep 17 00:00:00 2001 From: psainics Date: Wed, 9 Apr 2025 12:55:32 +0530 Subject: [PATCH 2/4] Fix NPE in getFileSystemProperties --- .../java/io/cdap/plugin/batch/source/ftp/FTPBatchSource.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/cdap/plugin/batch/source/ftp/FTPBatchSource.java b/src/main/java/io/cdap/plugin/batch/source/ftp/FTPBatchSource.java index 640f1b2..1aaab67 100644 --- a/src/main/java/io/cdap/plugin/batch/source/ftp/FTPBatchSource.java +++ b/src/main/java/io/cdap/plugin/batch/source/ftp/FTPBatchSource.java @@ -73,7 +73,9 @@ protected Map getFileSystemProperties(@Nullable BatchSourceConte failureCollector.getOrThrowException(); } Map properties = new HashMap<>(config.getFileSystemProperties()); - properties.putAll(location.getHadoopProperties()); + if (location != null) { + properties.putAll(location.getHadoopProperties()); + } return properties; } From c14bfea97784edc3c784f8cecc37afac867b1440 Mon Sep 17 00:00:00 2001 From: 
psainics Date: Thu, 1 May 2025 15:26:05 +0530 Subject: [PATCH 3/4] Update cdap.version to 6.11.0 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index c7f1e13..b6df667 100644 --- a/pom.xml +++ b/pom.xml @@ -36,8 +36,8 @@ system:cdap-data-pipeline[6.11.0-SNAPSHOT,7.0.0-SNAPSHOT) system:cdap-data-streams[6.11.0-SNAPSHOT,7.0.0-SNAPSHOT) - 6.11.0-SNAPSHOT - 2.13.0-SNAPSHOT + 6.11.0 + 2.13.0 3.3.6 2.6 4.11 From d79d33020b4da8ff33c6c3f8a6f632767328aba3 Mon Sep 17 00:00:00 2001 From: psainics Date: Thu, 1 May 2025 15:59:32 +0530 Subject: [PATCH 4/4] Adding XLS UI elements for ftp source --- docs/FTPSource-batchsource.md | 19 +++- .../plugin/batch/source/ftp/FTPConfig.java | 38 ++++++- widgets/FTPSource-batchsource.json | 105 +++++++++++++++++- 3 files changed, 156 insertions(+), 6 deletions(-) diff --git a/docs/FTPSource-batchsource.md b/docs/FTPSource-batchsource.md index 56d59b0..8e29458 100644 --- a/docs/FTPSource-batchsource.md +++ b/docs/FTPSource-batchsource.md @@ -28,7 +28,7 @@ Properties **Password:** Password to use for authentication. **Format:** Format of the data to read. -The format must be one of 'blob', 'csv', 'delimited', 'json', 'text', 'tsv', or the +The format must be one of 'blob', 'csv', 'delimited', 'json', 'text', 'tsv', 'xls', or the name of any format plugin that you have deployed to your environment. Note that FTP does not support seeking in a file, so formats like avro and parquet cannot be used. If the format is a macro, only the formats listed above can be used. @@ -36,7 +36,20 @@ If the format is 'blob', every input file will be read into a separate record. The 'blob' format also requires a schema that contains a field named 'body' of type 'bytes'. If the format is 'text', the schema must contain a field named 'body' of type 'string'. -**Get Schema:** Auto-detects schema from file. Supported formats are: csv, delimited, tsv, blob and text. 
+**Sample Size:** The maximum number of rows that will be inspected for automatic data type detection. +The default value is 1000. This is only used when the format is 'xls'. + +**Override:** A list of columns with the corresponding data types for which automatic data type detection is +skipped. This is only used when the format is 'xls'. + +**Terminate Reading After Empty Row:** Specify whether to stop reading after encountering the first empty row. Defaults to false. When false, the reader will read all rows in the sheet. This is only used when the format is 'xls'. + +**Select Sheet Using:** Select the sheet by name or number. Default is 'Sheet Number'. This is only used when the format is 'xls'. + +**Sheet Value:** The name/number of the sheet to read from. If not specified, the first sheet will be read. +Sheet numbers are 0-based, i.e. the first sheet is 0. This is only used when the format is 'xls'. + +**Get Schema:** Auto-detects schema from file. Supported formats are: csv, delimited, tsv, xls, blob and text. Blob - is set by default as field named 'body' of type bytes. @@ -47,7 +60,7 @@ JSON - is not supported. You must manually provide the output schema. **Delimiter:** Delimiter to use when the format is 'delimited'. This will be ignored for other formats. **Use First Row as Header:** Whether to use the first line of each file as the column headers. Supported formats are ' -text', 'csv', 'tsv', and 'delimited'. +text', 'csv', 'tsv', 'xls', and 'delimited'. **Enable Quoted Values** Whether to treat content between quotes as a value. This value will only be used if the format is 'csv', 'tsv' or 'delimited'. 
For example, if this is set to true, a line that looks like `1, "a, b, c"` will output diff --git a/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java b/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java index 1454aab..a7033b4 100644 --- a/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java +++ b/src/main/java/io/cdap/plugin/batch/source/ftp/FTPConfig.java @@ -22,6 +22,7 @@ import com.google.gson.Gson; import io.cdap.cdap.api.annotation.Description; import io.cdap.cdap.api.annotation.Macro; +import io.cdap.cdap.api.annotation.Name; import io.cdap.cdap.api.data.schema.Schema; import io.cdap.cdap.api.plugin.PluginConfig; import io.cdap.cdap.etl.api.FailureCollector; @@ -58,6 +59,9 @@ public class FTPConfig extends PluginConfig implements FileSourceProperties { private static final Type MAP_STRING_STRING_TYPE = new TypeToken>() { }.getType(); private static final List LOCATION_PROPERTIES = Arrays.asList("type", "host", "path", "user", "password"); + private static final String NAME_SHEET = "sheet"; + private static final String NAME_SHEET_VALUE = "sheetValue"; + private static final String NAME_TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow"; @Description("Name be used to uniquely identify this source for lineage, annotating metadata, etc.") private final String referenceName; @@ -110,13 +114,24 @@ public class FTPConfig extends PluginConfig implements FileSourceProperties { @Macro @Nullable - @Description("Whether to use first row as header. Supported formats are 'text', 'csv', 'tsv', " + + @Description("The maximum number of rows that will get investigated for automatic data type detection.") + private Long sampleSize; + + @Macro + @Nullable + @Description("A list of columns with the corresponding data types for whom the automatic data type detection gets" + + " skipped.") + private String override; + + @Macro + @Nullable + @Description("Whether to use first row as header. 
Supported formats are 'text', 'csv', 'tsv', 'xls', " + "'delimited'. Default value is false.") private final Boolean skipHeader; @Macro @Description("Format of the data to read. Supported formats are 'avro', 'blob', 'csv', 'delimited', 'json', " - + "'parquet', 'text', or 'tsv'. If no format is given, it will default to 'text'.") + + "'parquet', 'text', or 'tsv', 'xls'. If no format is given, it will default to 'text'.") private final String format; @Macro @@ -148,6 +163,25 @@ public class FTPConfig extends PluginConfig implements FileSourceProperties { @Description("Maximum time in milliseconds to wait for connection initialization before time out.") private final Integer connectTimeout; + @Name(NAME_SHEET) + @Macro + @Nullable + @Description("Select the sheet by name or number. Default is 'Sheet Number'.") + private String sheet; + + @Name(NAME_SHEET_VALUE) + @Macro + @Nullable + @Description("The name/number of the sheet to read from. If not specified, the first sheet will be read." + + "Sheet Numbers are 0 based, ie first sheet is 0.") + private String sheetValue; + + @Name(NAME_TERMINATE_IF_EMPTY_ROW) + @Macro + @Nullable + @Description("Specify whether to stop reading after encountering the first empty row. 
Defaults to false.") + private String terminateIfEmptyRow; + @VisibleForTesting private FTPConfig(@Nullable String referenceName, String type, String host, @Nullable Integer port, String path, String user, String password, @Nullable String fileSystemProperties, diff --git a/widgets/FTPSource-batchsource.json b/widgets/FTPSource-batchsource.json index a89ee3d..8be8fd6 100644 --- a/widgets/FTPSource-batchsource.json +++ b/widgets/FTPSource-batchsource.json @@ -88,6 +88,10 @@ { "label": "tsv", "value": "tsv" + }, + { + "label": "xls", + "value": "xls" } ] } @@ -96,6 +100,36 @@ "widget-type": "get-schema", "widget-category": "plugin" }, + { + "widget-type": "number", + "label": "Sample Size", + "name": "sampleSize", + "widget-attributes": { + "default": "1000", + "minimum": "1" + } + }, + { + "widget-type": "keyvalue-dropdown", + "label": "Override", + "name": "override", + "widget-attributes": { + "key-placeholder": "Field Name", + "value-placeholder": "Data Type", + "dropdownOptions": [ + "boolean", + "bytes", + "double", + "float", + "int", + "long", + "string", + "date", + "time", + "timestamp" + ] + } + }, { "widget-type": "textbox", "label": "Delimiter", @@ -151,6 +185,42 @@ "label": "False" } } + }, + { + "widget-type": "toggle", + "label": "Terminate Reading After Empty Row", + "name": "terminateIfEmptyRow", + "widget-attributes": { + "default": "false", + "on": { + "value": "true", + "label": "True" + }, + "off": { + "value": "false", + "label": "False" + } + } + }, + { + "widget-type": "select", + "label": "Select Sheet Using", + "name": "sheet", + "widget-attributes": { + "values": [ + "Sheet Name", + "Sheet Number" + ], + "default": "Sheet Number" + } + }, + { + "widget-type": "textbox", + "label": "Sheet Value", + "name": "sheetValue", + "widget-attributes": { + "default": "0" + } } ] }, @@ -257,13 +327,46 @@ { "name": "skipHeader", "condition": { - "expression": "format == 'delimited' || format == 'csv' || format == 'tsv'" + "expression": "format == 
'delimited' || format == 'csv' || format == 'tsv' || format == 'xls'" }, "show": [ { "name": "skipHeader" } ] + }, + { + "name": "sheet", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "sheet" + } + ] + }, + { + "name": "sheetValue", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "sheetValue" + } + ] + }, + { + "name": "terminateIfEmptyRow", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "terminateIfEmptyRow" + } + ] } ], "jump-config": {