package ai.platon.scent.tools;

import ai.platon.pulsar.common.LogsKt;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.persist.gora.generated.GWebPage;
import ai.platon.pulsar.skeleton.crawl.filter.ChainedUrlNormalizer;
import ai.platon.scent.ScentSession;
import ai.platon.scent.common.ScentWebPageExtKt;
import ai.platon.scent.common.sites.amazon.AmazonAsinUrlNormalizer;
import ai.platon.scent.dom.HarvestOptions;
import ai.platon.scent.ml.EncodeOptions;
import ai.platon.scent.ml.data.FrameNode;
import ai.platon.scent.ml.data.SimpleDataFrame;
import ai.platon.scent.tools.HarvestTaskExecutor;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import kotlin.Metadata;
import kotlin.collections.CollectionsKt;
import kotlin.collections.SetsKt;
import kotlin.enums.EnumEntries;
import kotlin.enums.EnumEntriesKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.Reflection;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.sequences.Sequence;
import kotlin.sequences.SequencesKt;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.slf4j.Logger;

/* compiled from: SequenceHarvestTaskExecutor.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��:\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0004\n\u0002\u0018\u0002\n\u0002\b\u0005\n\u0002\u0018\u0002\n��\n\u0002\u0010\u0002\n��\n\u0002\u0018\u0002\n\u0002\u0010\u000e\n\u0002\b\u0003\n\u0002\u0010\u000b\n\u0002\b\u0003\u0018��2\u00020\u0001B\u000f\u0012\b\b\u0002\u0010\u0002\u001a\u00020\u0003¢\u0006\u0002\u0010\u0004J\u0014\u0010\u000f\u001a\u00020\u00102\f\u0010\u0011\u001a\b\u0012\u0004\u0012\u00020\u00130\u0012J&\u0010\u0014\u001a\u00020\u00102\f\u0010\u0011\u001a\b\u0012\u0004\u0012\u00020\u00130\u00122\u0006\u0010\u0015\u001a\u00020\u00132\b\b\u0002\u0010\u0016\u001a\u00020\u0017J\u001e\u0010\u0018\u001a\u00020\u00102\f\u0010\u0011\u001a\b\u0012\u0004\u0012\u00020\u00130\u00122\b\b\u0002\u0010\u0015\u001a\u00020\u0013J\u000e\u0010\u0019\u001a\u00020\u00102\u0006\u0010\u0015\u001a\u00020\u0013R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\u0005\u0010\u0006R\u001a\u0010\u0007\u001a\u00020\bX\u0086\u000e¢\u0006\u000e\n��\u001a\u0004\b\t\u0010\n\"\u0004\b\u000b\u0010\fR\u000e\u0010\r\u001a\u00020\u000eX\u0082\u0004¢\u0006\u0002\n��¨\u0006\u001a"}, d2 = {"Lai/platon/scent/tools/SequenceHarvestTaskExecutor;", "Lai/platon/scent/tools/HarvestTaskExecutor;", "crawler", "Lai/platon/scent/tools/BasicWebHarvester;", "(Lai/platon/scent/tools/BasicWebHarvester;)V", "getCrawler", "()Lai/platon/scent/tools/BasicWebHarvester;", "datasetPath", "Ljava/nio/file/Path;", "getDatasetPath", "()Ljava/nio/file/Path;", "setDatasetPath", "(Ljava/nio/file/Path;)V", "logger", "Lorg/slf4j/Logger;", "check", "", "urls", "Lkotlin/sequences/Sequence;", "", "encode", "args", "annotated", "", "harvest", "kmeans", "scent-boot"})
@SourceDebugExtension({"SMAP\nSequenceHarvestTaskExecutor.kt\nKotlin\n*S Kotlin\n*F\n+ 1 SequenceHarvestTaskExecutor.kt\nai/platon/scent/tools/SequenceHarvestTaskExecutor\n+ 2 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 3 ArraysJVM.kt\nkotlin/collections/ArraysKt__ArraysJVMKt\n+ 4 _Sequences.kt\nkotlin/sequences/SequencesKt___SequencesKt\n+ 5 fake.kt\nkotlin/jvm/internal/FakeKt\n*L\n1#1,72:1\n1549#2:73\n1620#2,3:74\n37#3,2:77\n1324#4,3:79\n1#5:82\n*S KotlinDebug\n*F\n+ 1 SequenceHarvestTaskExecutor.kt\nai/platon/scent/tools/SequenceHarvestTaskExecutor\n*L\n27#1:73\n27#1:74,3\n27#1:77,2\n29#1:79,3\n*E\n"})
/* loaded from: input_file:ai/platon/scent/tools/SequenceHarvestTaskExecutor.class */
public final class SequenceHarvestTaskExecutor extends HarvestTaskExecutor {

    /** Harvester supplied at construction; {@code harvest} hands the parsed documents to it. */
    @NotNull
    private final BasicWebHarvester crawler;

    /** Logger obtained for this class in the constructor. */
    @NotNull
    private final Logger logger;

    /**
     * Location of the dataset file. {@code encode} deletes any file already at this path and
     * passes the path on through {@code EncodeOptions}.
     */
    @NotNull
    private Path datasetPath;

    /* compiled from: SequenceHarvestTaskExecutor.kt */
    @Metadata(mv = {1, 9, 0}, k = 3, xi = 48)
    /* loaded from: input_file:ai/platon/scent/tools/SequenceHarvestTaskExecutor$EntriesMappings.class */
    public /* synthetic */ class EntriesMappings {
        public static final /* synthetic */ EnumEntries<GWebPage.Field> entries$0 = EnumEntriesKt.enumEntries(GWebPage.Field.values());
    }

    /**
     * Creates an executor that works on the session of the given harvester.
     *
     * <p>Construction also obtains a logger for this class, asks
     * {@code HarvestTaskExecutor.Companion} for a dataset path (using the default for its
     * argument) and adds an {@link AmazonAsinUrlNormalizer} to the URL normalizer of the
     * session's context.
     *
     * @param basicWebHarvester the harvester (named "crawler" in the Kotlin source); must not be null
     * @throws NullPointerException if {@code basicWebHarvester} is null
     */
    public SequenceHarvestTaskExecutor(@NotNull BasicWebHarvester basicWebHarvester) {
        // super(...) has to be the first statement in Java, so the parameter check is performed
        // inside the argument expression, before the harvester is dereferenced.
        super(requireCrawler(basicWebHarvester).getSession());
        this.crawler = basicWebHarvester;
        this.logger = LogsKt.getLogger(Reflection.getOrCreateKotlinClass(SequenceHarvestTaskExecutor.class));
        this.datasetPath = HarvestTaskExecutor.Companion.createDatasetPath$default(HarvestTaskExecutor.Companion, null, 1, null);
        ChainedUrlNormalizer.add$default(getSession().getContext().getUrlNormalizer(), new AmazonAsinUrlNormalizer(), (String) null, 2, (Object) null);
    }

    /** Rejects a null harvester and otherwise returns it unchanged. */
    private static BasicWebHarvester requireCrawler(BasicWebHarvester candidate) {
        Intrinsics.checkNotNullParameter(candidate, "crawler");
        return candidate;
    }

    /**
     * Synthetic constructor backing the Kotlin default argument.
     *
     * @param basicWebHarvester the harvester to use when bit 0x1 of {@code i} is clear
     * @param i default-argument mask; when bit 0x1 is set a {@code BasicWebHarvester} built with
     *          its own defaults replaces {@code basicWebHarvester}
     * @param defaultConstructorMarker marker that only distinguishes this overload; never read
     */
    public /* synthetic */ SequenceHarvestTaskExecutor(BasicWebHarvester basicWebHarvester, int i, DefaultConstructorMarker defaultConstructorMarker) {
        this((i & 1) == 0 ? basicWebHarvester : new BasicWebHarvester(null, null, 3, null));
    }

    /**
     * Returns the harvester this executor was constructed with.
     *
     * @return the harvester passed to the constructor
     */
    @NotNull
    public final BasicWebHarvester getCrawler() {
        return crawler;
    }

    /**
     * Returns the current dataset path.
     *
     * @return the path most recently assigned by the constructor or {@code setDatasetPath}
     */
    @NotNull
    public final Path getDatasetPath() {
        return datasetPath;
    }

    /**
     * Replaces the dataset path used by later calls.
     *
     * @param path the new dataset path; rejected by the Kotlin parameter check when null
     */
    public final void setDatasetPath(@NotNull Path path) {
        Intrinsics.checkNotNullParameter(path, "<set-?>");
        datasetPath = path;
    }

    /**
     * Loads each URL through the session, keeps the pages whose content length is greater than
     * 800000, and prints a two-line report per kept page to {@code System.out}.
     *
     * <p>The first line has the form
     * {@code <n>. <contentLength> | <mlLabels values> | <persistedContentLength> | <url>}, where
     * {@code n} counts the kept pages starting at 1. The second line is the base URI of the
     * parsed page. The sequence is consumed lazily, so pages are loaded one at a time while the
     * report is printed.
     *
     * @param sequence the URLs to load (named "urls" in the Kotlin source); must not be null
     */
    public final void check(@NotNull Sequence<String> sequence) {
        Intrinsics.checkNotNullParameter(sequence, "urls");

        Sequence<WebPage> loadedPages = SequencesKt.map(sequence, url -> getSession().load(url));
        Sequence<WebPage> largePages = SequencesKt.filter(loadedPages, page -> page.getContentLength() > 800000);

        // A Kotlin Sequence is not a java.lang.Iterable, so it is walked through its iterator.
        int index = 0;
        Iterator<WebPage> it = largePages.iterator();
        while (it.hasNext()) {
            WebPage page = it.next();
            if (index < 0) {
                // the counter wrapped around; same guard Kotlin's indexed iteration applies
                CollectionsKt.throwIndexOverflow();
            }
            int ordinal = ++index;

            long contentLength = page.getContentLength();
            long persistedContentLength = page.getPersistedContentLength();
            Map<?, ?> mlLabels = ScentWebPageExtKt.getMlLabels(page);
            Object labels = mlLabels != null ? mlLabels.values() : null;
            String url = page.getUrl();

            // NOTE(review): the positions of the labels and url columns are reconstructed from
            // the order in which the values are read - confirm against the Kotlin source.
            System.out.println(ordinal + ". " + contentLength + " | " + labels + " | " + persistedContentLength + " | " + url);
            System.out.println((Object) getSession().parse(page).getBaseURI());
        }
    }

    /**
     * Encodes the {@code body} elements of the given pages through the session.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>delete any file already present at the dataset path;</li>
     *   <li>build harvest options from {@code str};</li>
     *   <li>lazily load each URL, optionally keep only pages that carry ML labels, parse each
     *       page with the options and take its first {@code body} element (documents without
     *       one are skipped);</li>
     *   <li>pass those elements to {@code encodeForElements} together with
     *       {@code EncodeOptions} built on the dataset path and a node predicate that accepts
     *       every node;</li>
     *   <li>log the summed point count of all frame nodes, the frame size and the dataset path.</li>
     * </ol>
     *
     * @param sequence the URLs to load (named "urls" in the Kotlin source); must not be null
     * @param str option arguments (named "args" in the Kotlin source); must not be null
     * @param z when {@code true} ("annotated" in the Kotlin source), pages whose ML label map is
     *          null or empty are dropped; when {@code false} every loaded page is kept
     * @throws IOException if the existing dataset file cannot be deleted
     */
    public final void encode(@NotNull Sequence<String> sequence, @NotNull String str, final boolean z) throws IOException {
        Intrinsics.checkNotNullParameter(sequence, "urls");
        Intrinsics.checkNotNullParameter(str, "args");

        Files.deleteIfExists(this.datasetPath);
        final HarvestOptions options = getSession().options(str);

        // Everything below is lazy: pages are loaded and parsed only while encodeForElements
        // iterates over the root elements.
        Sequence<WebPage> pages = SequencesKt.map(sequence, url -> getSession().load(url));
        Sequence<WebPage> selectedPages = SequencesKt.filter(pages, page -> {
            if (!z) {
                return true;
            }
            Map<?, ?> mlLabels = ScentWebPageExtKt.getMlLabels(page);
            return mlLabels != null && !mlLabels.isEmpty();
        });
        Sequence<FeaturedDocument> documents = SequencesKt.map(selectedPages,
                page -> ScentSession.DefaultImpls.parse$default(getSession(), page, options, false, 4, (Object) null));
        Sequence<Element> rootElements = SequencesKt.mapNotNull(documents, document -> document.selectFirstOrNull("body"));

        Function1<Node, Boolean> acceptEveryNode = node -> true;
        SimpleDataFrame dataFrame = getSession().encodeForElements(
                SequencesKt.asIterable(rootElements),
                new EncodeOptions(this.datasetPath, true, (List) null, 0, 0, 28, (DefaultConstructorMarker) null),
                acceptEveryNode);

        int pointCount = 0;
        Iterator<? extends FrameNode> nodes = dataFrame.getNodes().iterator();
        while (nodes.hasNext()) {
            pointCount += nodes.next().getPoints().size();
        }

        this.logger.info("Dataset size: {}, total documents: {}, dataset exported:\n{}", pointCount, dataFrame.getSize(), this.datasetPath);
        this.logger.info("All done.");
    }

    /**
     * Synthetic bridge backing the Kotlin default argument of {@code encode}.
     *
     * @param sequenceHarvestTaskExecutor the receiver on which {@code encode} is invoked
     * @param sequence forwarded unchanged
     * @param str forwarded unchanged
     * @param z forwarded unless bit 0x4 of {@code i} is set, in which case {@code false} is used
     * @param i default-argument mask
     * @param obj unused
     * @throws IOException if {@code encode} fails to delete the existing dataset file
     */
    public static /* synthetic */ void encode$default(SequenceHarvestTaskExecutor sequenceHarvestTaskExecutor, Sequence sequence, String str, boolean z, int i, Object obj) throws IOException {
        if ((i & 4) != 0) {
            z = false;
        }
        sequenceHarvestTaskExecutor.encode(sequence, str, z);
    }

    /**
     * Logs two messages pointing to the scent-spark project; this method itself performs no
     * clustering.
     *
     * @param str arguments (named "args" in the Kotlin source); checked for null and otherwise unused
     */
    public final void kmeans(@NotNull String str) {
        Intrinsics.checkNotNullParameter(str, "args");
        final Logger log = this.logger;
        log.info("Use project scent-spark to cluster the dataset");
        log.info("https://github.com/galaxyeye/scent-spark/blob/main/src/main/java/ai/platon/scent/ml/clustering/DomKMeans.java");
    }

    /**
     * Parses the given pages and hands them to the harvester.
     *
     * <p>The harvest options are built from {@code str} with
     * {@code " -diagnose -vj -trustSamples"} appended. Each URL is loaded through the session
     * and parsed with those options; both steps are lazy and run only as the harvester
     * consumes the sequence. Before delegating, the JVM input arguments are logged.
     *
     * @param sequence the URLs to load (named "urls" in the Kotlin source); must not be null
     * @param str option arguments (named "args" in the Kotlin source); must not be null
     */
    public final void harvest(@NotNull Sequence<String> sequence, @NotNull String str) {
        Intrinsics.checkNotNullParameter(sequence, "urls");
        Intrinsics.checkNotNullParameter(str, "args");

        final HarvestOptions options = getSession().options(str + " -diagnose -vj -trustSamples");

        Sequence<WebPage> pages = SequencesKt.map(sequence, url -> getSession().load(url));
        Sequence<FeaturedDocument> documents = SequencesKt.map(pages,
                page -> ScentSession.DefaultImpls.parse$default(getSession(), page, options, false, 4, (Object) null));

        this.logger.info("{}", ManagementFactory.getRuntimeMXBean().getInputArguments());
        this.crawler.harvest(documents, options);
    }

    /**
     * Synthetic bridge backing the Kotlin default argument of {@code harvest}.
     *
     * @param sequenceHarvestTaskExecutor the receiver on which {@code harvest} is invoked
     * @param sequence forwarded unchanged
     * @param str forwarded unless bit 0x2 of {@code i} is set, in which case {@code ""} is used
     * @param i default-argument mask
     * @param obj unused
     */
    public static /* synthetic */ void harvest$default(SequenceHarvestTaskExecutor sequenceHarvestTaskExecutor, Sequence sequence, String str, int i, Object obj) {
        final String effectiveArgs = (i & 2) == 0 ? str : "";
        sequenceHarvestTaskExecutor.harvest(sequence, effectiveArgs);
    }

    /**
     * Creates an executor with a default harvester by delegating to the synthetic constructor
     * with mask {@code 1}, which makes it build a {@code BasicWebHarvester} itself.
     */
    public SequenceHarvestTaskExecutor() {
        this((BasicWebHarvester) null, 1, (DefaultConstructorMarker) null);
    }
}
