package ai.platon.scent.tools;

import ai.platon.pulsar.common.ResourceLoader;
import ai.platon.pulsar.common.urls.Hyperlink;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.skeleton.common.options.LoadOptions;
import ai.platon.pulsar.skeleton.crawl.filter.ChainedUrlNormalizer;
import ai.platon.scent.BasicScentSession;
import ai.platon.scent.common.MLPaths;
import ai.platon.scent.dom.HarvestOptions;
import ai.platon.scent.ml.EncodeOptions;
import ai.platon.scent.ml.data.SimpleDataFrame;
import ai.platon.scent.ml.encoding.EncodeProject;
import ai.platon.scent.ml.harvest.HarvestProject;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.attribute.FileAttribute;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import kotlin.Metadata;
import kotlin.Pair;
import kotlin.TuplesKt;
import kotlin.collections.CollectionsKt;
import kotlin.io.path.PathsKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.functions.Function2;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.sequences.SequencesKt;
import kotlin.text.StringsKt;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.kotlinx.dataframe.DataFrame;
import org.jetbrains.kotlinx.dataframe.aggregation.AggregateGroupedDsl;
import org.jetbrains.kotlinx.dataframe.api.ChunkedKt;
import org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl;
import org.jetbrains.kotlinx.dataframe.api.FirstKt;
import org.jetbrains.kotlinx.dataframe.api.GroupByKt;
import org.jetbrains.kotlinx.dataframe.api.PrintKt;
import org.jetbrains.kotlinx.dataframe.api.RenameKt;
import org.jetbrains.kotlinx.dataframe.api.SelectKt;
import org.jetbrains.kotlinx.dataframe.api.SplitKt;
import org.jetbrains.kotlinx.dataframe.columns.ColumnsResolver;
import org.jetbrains.kotlinx.dataframe.io.CsvKt;
import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData;
import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration;
import org.jetbrains.kotlinx.dataframe.io.GuessKt;
import org.jetbrains.kotlinx.dataframe.io.HtmlKt;
import org.jetbrains.kotlinx.dataframe.jupyter.CellRenderer;
import org.jsoup.nodes.Node;

/* compiled from: EBayHarvest.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��V\n\u0002\u0018\u0002\n\u0002\u0010��\n��\n\u0002\u0010\u000e\n\u0002\b\u0005\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010 \n\u0002\u0018\u0002\n��\n\u0002\u0010\u0002\n\u0002\b\u0004\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0002\u0010\u000b\n\u0002\b\u0005\u0018��2\u00020\u0001B\u0017\u0012\u0006\u0010\u0002\u001a\u00020\u0003\u0012\b\b\u0002\u0010\u0004\u001a\u00020\u0003¢\u0006\u0002\u0010\u0005J\f\u0010\u0014\u001a\b\u0012\u0004\u0012\u00020\u00160\u0015J\u0006\u0010\u0017\u001a\u00020\u0018J\u0006\u0010\u0019\u001a\u00020\u0018J(\u0010\u001a\u001a\u00020\u00182\f\u0010\u001b\u001a\b\u0012\u0004\u0012\u00020\u00030\u00152\u0012\u0010\u001c\u001a\u000e\u0012\u0004\u0012\u00020\u001e\u0012\u0004\u0012\u00020\u001f0\u001dJ\u0010\u0010 \u001a\u00020\u00032\u0006\u0010!\u001a\u00020\u0003H\u0002J\u001a\u0010\"\u001a\b\u0012\u0004\u0012\u00020\u00030\u00152\f\u0010#\u001a\b\u0012\u0004\u0012\u00020\u00030\u0015R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\u0006\u0010\u0007R\u000e\u0010\b\u001a\u00020\tX\u0082\u0004¢\u0006\u0002\n��R\u0016\u0010\n\u001a\n \f*\u0004\u0018\u00010\u000b0\u000bX\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\r\u001a\u00020\u000eX\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\u000f\u001a\u00020\u0010X\u0082\u0004¢\u0006\u0002\n��R\u0011\u0010\u0004\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\u0011\u0010\u0007R\u000e\u0010\u0012\u001a\u00020\u0013X\u0082\u0004¢\u0006\u0002\n��¨\u0006$"}, d2 = {"Lai/platon/scent/tools/EBayHarvester;", "", "args", "", "projectInfo", "(Ljava/lang/String;Ljava/lang/String;)V", "getArgs", "()Ljava/lang/String;", "crawler", "Lai/platon/scent/tools/SimpleCrawler;", "datasetPath", "Ljava/nio/file/Path;", "kotlin.jvm.PlatformType", "encodeOptions", "Lai/platon/scent/ml/EncodeOptions;", "project", 
"Lai/platon/scent/ml/encoding/EncodeProject;", "getProjectInfo", "session", "Lai/platon/scent/BasicScentSession;", "collectListPageLinks", "", "Lai/platon/pulsar/common/urls/Hyperlink;", "createHarvestResultDatasetView", "", "createPredictionAndMinimalDataset", "encodeAll", "urls", "nodeFilter", "Lkotlin/Function1;", "Lorg/jsoup/nodes/Node;", "", "getOrCreateProjectInfo", "more", "loadOutPages", "portalUrls", "scent-boot"})
@SourceDebugExtension({"SMAP\nEBayHarvest.kt\nKotlin\n*S Kotlin\n*F\n+ 1 EBayHarvest.kt\nai/platon/scent/tools/EBayHarvester\n+ 2 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 3 fake.kt\nkotlin/jvm/internal/FakeKt\n+ 4 forEach.kt\norg/jetbrains/kotlinx/dataframe/api/ForEachKt\n*L\n1#1,203:1\n1360#2:204\n1446#2,5:205\n1611#2:210\n1855#2:211\n1856#2:213\n1612#2:214\n1549#2:215\n1620#2,3:216\n1549#2:220\n1620#2,3:221\n1360#2:224\n1446#2,5:225\n1864#2,3:231\n1#3:212\n1#3:219\n12#4:230\n*S KotlinDebug\n*F\n+ 1 EBayHarvest.kt\nai/platon/scent/tools/EBayHarvester\n*L\n82#1:204\n82#1:205,5\n83#1:210\n83#1:211\n83#1:213\n83#1:214\n84#1:215\n84#1:216,3\n99#1:220\n99#1:221,3\n100#1:224\n100#1:225,5\n138#1:231,3\n83#1:212\n138#1:230\n*E\n"})
/* loaded from: input_file:ai/platon/scent/tools/EBayHarvester.class */
/*
 * Harvesting tool for eBay pages: collects links from list (portal) pages, loads the
 * linked item pages, encodes selected page components into a dataset and derives
 * CSV/HTML views from that dataset.
 *
 * This source is decompiled Kotlin. Calls ending in "$default" and trailing
 * (int mask, Object marker) argument pairs are compiler-generated bridges that
 * fill in Kotlin default parameter values.
 */
public final class EBayHarvester {

    /** Load/harvest arguments; parsed with {@code session.options(args)} wherever options are needed. */
    @NotNull
    private final String args;

    /** Caller-supplied project description; when blank a default description is generated. */
    @NotNull
    private final String projectInfo;

    /** Crawler that provides the session used by this harvester. */
    @NotNull
    private final SimpleCrawler crawler;

    /** Session obtained from {@link #crawler}; used to load, submit and encode documents. */
    @NotNull
    private final BasicScentSession session;

    /** Encode project of type {@code PREDICT}; supplies the dataset path and the project id. */
    @NotNull
    private final EncodeProject project;

    /** Location of the dataset file, taken from {@link #project}. */
    private final Path datasetPath;

    /** Options passed to the encoder; built from {@link #datasetPath} with all other values defaulted. */
    @NotNull
    private final EncodeOptions encodeOptions;

    /**
     * Creates the harvester, prepares the dataset's parent directory and registers an
     * {@code EBayProductUrlNormalizer} with the session's URL normalizer chain.
     * Failures from the directory creation are not caught here.
     *
     * @param str  the harvest arguments ("args")
     * @param str2 the project description ("projectInfo"); may be blank
     */
    public EBayHarvester(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "args");
        Intrinsics.checkNotNullParameter(str2, "projectInfo");
        this.args = str;
        this.projectInfo = str2;
        // Mask 3: both constructor parameters of SimpleCrawler take their Kotlin defaults.
        this.crawler = new SimpleCrawler(null, null, 3, null);
        this.session = this.crawler.getSession();
        this.project = EncodeProject.Companion.newProject(EncodeProject.Type.PREDICT);
        this.datasetPath = this.project.getDatasetPath();
        // Mask 30: every EncodeOptions parameter except the dataset path takes its default.
        this.encodeOptions = new EncodeOptions(this.datasetPath, false, (List) null, 0, 0, 30, (DefaultConstructorMarker) null);
        Files.createDirectories(this.datasetPath.getParent());
        ChainedUrlNormalizer.add$default(this.session.getContext().getUrlNormalizer(), new EBayProductUrlNormalizer(), (String) null, 2, (Object) null);
    }

    /**
     * Compiler-generated bridge for the Kotlin default argument of {@code projectInfo}:
     * when bit 2 of {@code i} is set, {@code str2} is replaced by the empty string.
     */
    public /* synthetic */ EBayHarvester(String str, String str2, int i, DefaultConstructorMarker defaultConstructorMarker) {
        this(str, (i & 2) != 0 ? "" : str2);
    }

    /** @return the harvest arguments this instance was created with */
    @NotNull
    public final String getArgs() {
        return this.args;
    }

    /** @return the project description this instance was created with (possibly blank) */
    @NotNull
    public final String getProjectInfo() {
        return this.projectInfo;
    }

    /**
     * Loads the hard-coded eBay brand page and selects its hyperlinks matching
     * the CSS query {@code a[href~=/b/]}.
     *
     * @return the selected hyperlinks
     */
    @NotNull
    public final List<Hyperlink> collectListPageLinks() {
        return FeaturedDocument.selectHyperlinks$default(this.session.loadDocument("https://www.ebay.com/b/Apple/bn_21819543"), "a[href~=/b/]", 0, 0, 6, (Object) null);
    }

    /**
     * Loads the given portal pages, collects their out-links with the configured out-link
     * selector, normalizes and de-duplicates them, submits the resulting URLs with the item
     * options and waits on the session context.
     *
     * @param list the portal page URLs ("portalUrls")
     * @return the de-duplicated, normalized out-link URLs that were submitted; links for
     *         which the normalizer returns {@code null} are dropped
     */
    @NotNull
    public final List<String> loadOutPages(@NotNull List<String> list) {
        Intrinsics.checkNotNullParameter(list, "portalUrls");
        HarvestOptions options = this.session.options(this.args);
        LoadOptions itemOptions = options.createItemOptions();
        List<?> portalDocuments = this.session.loadDocuments(list, options);
        EBayProductUrlNormalizer normalizer = new EBayProductUrlNormalizer();

        // Gather the out-links of every portal page.
        List<Hyperlink> outLinks = new ArrayList<>();
        for (Object document : portalDocuments) {
            CollectionsKt.addAll(outLinks, FeaturedDocument.selectHyperlinks$default((FeaturedDocument) document, options.getOutLinkSelector(), 0, 0, 6, (Object) null));
        }

        // Normalize each link; the set removes links that normalize to an equal Hyperlink.
        HashSet<Hyperlink> uniqueLinks = new HashSet<>();
        for (Hyperlink link : outLinks) {
            Hyperlink normalized = EBayProductUrlNormalizer.normalize$default(normalizer, link, null, 2, null);
            if (normalized != null) {
                uniqueLinks.add(normalized);
            }
        }

        List<String> itemUrls = new ArrayList<>(uniqueLinks.size());
        for (Hyperlink link : uniqueLinks) {
            itemUrls.add(link.getUrl());
        }

        this.session.submitAll(itemUrls, itemOptions);
        this.session.getContext().await();
        return itemUrls;
    }

    /**
     * Loads every URL, selects the configured page components from each document, encodes
     * the selected elements and exports the resulting data frame. If the dataset file exists
     * afterwards, a sibling {@code <dataset name>.info.txt} file is written and the dataset is
     * handed to {@code MLPaths.copyToLearnUnsupervised}.
     *
     * @param list      the URLs of the pages to encode ("urls")
     * @param function1 the node filter ("nodeFilter") forwarded to the encoder
     * @throws IllegalArgumentException if the item options define no component selectors
     */
    public final void encodeAll(@NotNull List<String> list, @NotNull final Function1<? super Node, Boolean> function1) {
        Intrinsics.checkNotNullParameter(list, "urls");
        Intrinsics.checkNotNullParameter(function1, "nodeFilter");
        ArrayList componentSelectors = this.session.options(this.args).createItemOptions().getComponentSelectors();
        if (componentSelectors.isEmpty()) {
            throw new IllegalArgumentException("Component selectors cannot be empty!");
        }

        // Load all documents before selecting from any of them (same order as before).
        List<FeaturedDocument> documents = new ArrayList<>(list.size());
        for (String url : list) {
            documents.add(this.session.loadDocument(url));
        }

        // The combined selector does not depend on the document, so build it once.
        // Mask 63: joinToString uses all of its Kotlin defaults.
        String componentQuery = CollectionsKt.joinToString$default(componentSelectors, (CharSequence) null, (CharSequence) null, (CharSequence) null, 0, (CharSequence) null, (Function1) null, 63, (Object) null);

        // Element type of the selection result is not visible here, hence the raw list.
        ArrayList components = new ArrayList();
        for (FeaturedDocument document : documents) {
            CollectionsKt.addAll(components, FeaturedDocument.select$default(document, componentQuery, 0, 0, 6, (Object) null));
        }

        SimpleDataFrame dataFrame = this.session.encodeForElements(components, this.encodeOptions, new Function1<Node, Boolean>() { // from class: ai.platon.scent.tools.EBayHarvester$encodeAll$dataFrame$1
            @Override
            @NotNull
            public Boolean invoke(@NotNull Node node) {
                Intrinsics.checkNotNullParameter(node, "it");
                return function1.invoke(node);
            }
        });
        System.out.println("Total " + dataFrame.getRecordCount() + " records in " + dataFrame.getSize() + " documents are encoded.");
        dataFrame.export();

        if (!Files.exists(this.datasetPath)) {
            System.out.println("Dataset is not saved to " + this.datasetPath);
            return;
        }

        Path infoPath = this.datasetPath.resolveSibling(PathsKt.getNameWithoutExtension(this.datasetPath) + ".info.txt");
        // NOTE(review): the text is labelled "dataset base directory" but the value written is
        // the info file's own path; kept as is - confirm which path was intended.
        Files.writeString(infoPath, getOrCreateProjectInfo("dataset base directory: " + infoPath));
        MLPaths.INSTANCE.copyToLearnUnsupervised(this.datasetPath);
    }

    /**
     * Builds the text stored in the dataset's info file.
     *
     * @param str extra text ("more") appended to the description
     * @return {@code projectInfo + str} when a non-blank project info was supplied; otherwise a
     *         generated description containing the current time, a fixed node-filter line,
     *         the harvest arguments and {@code str}
     */
    private final String getOrCreateProjectInfo(String str) {
        if (!StringsKt.isBlank(this.projectInfo)) {
            return this.projectInfo + str;
        }
        // The "nodeFilter" line is a fixed string; it is not derived from the filter given to encodeAll.
        return StringsKt.trimIndent("\n            buildTime: " + OffsetDateTime.now() + "\n            nodeFilter: it.isRegularText && it.nthScreen <= 2\n            args: " + this.args + "\n            " + str + "\n    ");
    }

    /**
     * Reads the encoded dataset, reduces it to the label, prediction, geometry, text and url
     * columns and writes it as CSV part files of up to 2000 rows each into the harvest
     * project's prediction-and-minimal-features directory, which is emptied first.
     */
    public final void createPredictionAndMinimalDataset() {
        HarvestProject harvestProject = new HarvestProject(this.project.getId());
        DataFrame features = readMinimalFeatures(harvestProject.getEncodeProject().getDatasetPath().toFile());
        Path outputDir = recreatePredictionOutputDir(harvestProject);

        int partIndex = 0;
        for (Object chunk : ChunkedKt.chunked$default(features, 2000, (String) null, 2, (Object) null).values()) {
            // File name: zero-padded part index plus a random UUID.
            String partName = "part-" + StringUtils.leftPad(String.valueOf(partIndex), 5, "0") + "-" + UUID.randomUUID() + ".csv";
            CsvKt.writeCSV$default((DataFrame) chunk, outputDir.resolve(partName).toFile(), (CSVFormat) null, 2, (Object) null);
            partIndex++;
        }
    }

    /**
     * Builds a per-URL view of the dataset (the url followed by all of its texts, split into
     * separate columns), prints it and writes it as {@code harvest-result-dataset-view.html}
     * into the harvest project's prediction-and-minimal-features directory.
     *
     * NOTE(review): the target directory is deleted and recreated here, which also removes any
     * CSV parts written by {@link #createPredictionAndMinimalDataset()} - confirm this is intended.
     */
    public final void createHarvestResultDatasetView() {
        File file = this.project.getDatasetPath().toFile();
        Intrinsics.checkNotNullExpressionValue(file, "toFile(...)");
        DataFrame features = readMinimalFeatures(file);

        // One row per url: the aggregated value is [url, text1, text2, ...].
        DataFrame aggregated = GroupByKt.groupBy(features, new String[]{"url"}).aggregate(new Function2<AggregateGroupedDsl<? extends Object>, AggregateGroupedDsl<? extends Object>, List<Object>>() { // from class: ai.platon.scent.tools.EBayHarvester$createHarvestResultDatasetView$1
            @NotNull
            public final List<Object> invoke(@NotNull AggregateGroupedDsl<? extends Object> aggregateGroupedDsl, @NotNull AggregateGroupedDsl<? extends Object> aggregateGroupedDsl2) {
                Intrinsics.checkNotNullParameter(aggregateGroupedDsl, "$this$aggregate");
                Intrinsics.checkNotNullParameter(aggregateGroupedDsl2, "it");
                Object first = FirstKt.first(aggregateGroupedDsl.get("url"));
                List<Object> mutableList = SequencesKt.toMutableList(SequencesKt.asSequence(aggregateGroupedDsl.get("text").iterator()));
                mutableList.add(0, first);
                return mutableList;
            }
        });

        // Keep only the aggregated column, renamed to "texts".
        DataFrame texts = SelectKt.select(RenameKt.rename(aggregated, new Pair[]{TuplesKt.to("aggregated", "texts")}), new String[]{"texts"});

        // Split the "texts" list into individual columns (no explicit column names given).
        DataFrame view = SplitKt.into$default(SplitKt.split(texts, new Function2<ColumnsSelectionDsl<? extends Object>, ColumnsSelectionDsl<? extends Object>, ColumnsResolver<? extends List<? extends Integer>>>() { // from class: ai.platon.scent.tools.EBayHarvester$createHarvestResultDatasetView$2
            @NotNull
            public final ColumnsResolver<List<Integer>> invoke(@NotNull ColumnsSelectionDsl<? extends Object> columnsSelectionDsl, @NotNull ColumnsSelectionDsl<? extends Object> columnsSelectionDsl2) {
                Intrinsics.checkNotNullParameter(columnsSelectionDsl, "$this$split");
                Intrinsics.checkNotNullParameter(columnsSelectionDsl2, "it");
                return columnsSelectionDsl.stringInvokeTyped("texts");
            }
        }), new String[0], (Function2) null, 2, (Object) null);
        PrintKt.print$default(view, 0, 0, false, false, false, false, 63, (Object) null);

        HarvestProject harvestProject = new HarvestProject(this.project.getId());
        Path outputDir = recreatePredictionOutputDir(harvestProject);
        File htmlFile = outputDir.resolve("harvest-result-dataset-view.html").toFile();
        // Append the table stylesheet from the bundled resources to the generated HTML.
        DataFrameHtmlData html = HtmlKt.toHTML$default(view, (DisplayConfiguration) null, (CellRenderer) null, (Function1) null, 7, (Object) null).plus(new DataFrameHtmlData(ResourceLoader.INSTANCE.readString("wwwroot/template/page.table.css"), (String) null, (String) null, 6, (DefaultConstructorMarker) null));
        Intrinsics.checkNotNull(htmlFile);
        html.writeHTML(htmlFile);
    }

    /**
     * Reads a dataset file, keeps the label, prediction, geometry, sequence, text and url
     * columns and strips the "-g0" suffix from the geometry/sequence column names.
     * The raw {@code DataFrame} type mirrors how the result is consumed by the callers.
     */
    private static DataFrame readMinimalFeatures(File datasetFile) {
        DataFrame selected = SelectKt.select(GuessKt.read$default(DataFrame.Companion, datasetFile, (List) null, 2, (Object) null), new String[]{"label", "prediction", "top-g0", "left-g0", "width-g0", "height-g0", "seq-g0", "text", "url"});
        return RenameKt.rename(selected, new Pair[]{TuplesKt.to("top-g0", "top"), TuplesKt.to("left-g0", "left"), TuplesKt.to("width-g0", "width"), TuplesKt.to("height-g0", "height"), TuplesKt.to("seq-g0", "seq")});
    }

    /**
     * Deletes the harvest project's prediction-and-minimal-features directory with all of its
     * content and creates it again, empty. Assumes the getter returns the same path on every
     * call - TODO confirm.
     *
     * @return the recreated directory
     */
    private static Path recreatePredictionOutputDir(HarvestProject harvestProject) {
        Path outputDir = harvestProject.getPredictionAndMinimalFeaturesBaseDir();
        FileUtils.deleteDirectory(outputDir.toFile());
        Files.createDirectories(outputDir);
        return outputDir;
    }
}
