package ai.platon.scent.tools;

import ai.platon.pulsar.common.LogsKt;
import ai.platon.pulsar.dom.Documents;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.dom.nodes.node.ext.ExportPaths;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.skeleton.common.options.LoadOptions;
import ai.platon.pulsar.skeleton.context.PulsarContext;
import ai.platon.pulsar.skeleton.crawl.filter.ScopedUrlNormalizer;
import ai.platon.scent.BasicScentSession;
import ai.platon.scent.analysis.corpus.FullFeaturedDocumentKt;
import ai.platon.scent.dom.HNormUrl;
import ai.platon.scent.dom.HarvestOptions;
import ai.platon.scent.dom.nodes.AnchorGroup;
import ai.platon.scent.entities.HarvestResult;
import ai.platon.scent.ml.harvest.HarvestProject;
import ai.platon.scent.ql.h2.context.ScentSQLContexts;
import ai.platon.scent.skeleton.ScentContext;
import ai.platon.scent.skeleton.ScentSession;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Stream;
import kotlin.Metadata;
import kotlin.Unit;
import kotlin.collections.CollectionsKt;
import kotlin.coroutines.CoroutineContext;
import kotlin.io.path.PathsKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.sequences.Sequence;
import kotlin.sequences.SequencesKt;
import kotlin.text.StringsKt;
import kotlinx.coroutines.BuildersKt;
import org.apache.commons.io.FileUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* compiled from: BasicWebHarvester.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��x\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010\u000e\n\u0002\b\u0002\n\u0002\u0010\u0002\n��\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0004\n\u0002\u0010\u001c\n��\n\u0002\u0010 \n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010\b\n\u0002\b\u0002\b\u0016\u0018��2\u00020\u0001B\u001f\u0012\f\b\u0002\u0010\u0002\u001a\u00060\u0003j\u0002`\u0004\u0012\n\b\u0002\u0010\u0005\u001a\u0004\u0018\u00010\u0006¢\u0006\u0002\u0010\u0007J\u0018\u0010\r\u001a\u0004\u0018\u00010\u000e2\u0006\u0010\u000f\u001a\u00020\u00102\u0006\u0010\u0011\u001a\u00020\u0010J\u0016\u0010\u0012\u001a\u00020\u00132\u0006\u0010\u0014\u001a\u00020\u00152\u0006\u0010\u0016\u001a\u00020\u0017J\u000e\u0010\u0018\u001a\u00020\u00132\u0006\u0010\u0019\u001a\u00020\u001aJ\u000e\u0010\u0018\u001a\u00020\u00132\u0006\u0010\u001b\u001a\u00020\u0010J\b\u0010\u001c\u001a\u00020\u0013H\u0016J\u0016\u0010\u001d\u001a\u00020\u00152\u0006\u0010\u000f\u001a\u00020\u00102\u0006\u0010\u0016\u001a\u00020\u0017J\u0016\u0010\u001d\u001a\u00020\u00152\u0006\u0010\u000f\u001a\u00020\u00102\u0006\u0010\u0011\u001a\u00020\u0010J\u001c\u0010\u001d\u001a\u00020\u00152\f\u0010\u001e\u001a\b\u0012\u0004\u0012\u00020\u00100\u001f2\u0006\u0010\u0011\u001a\u00020\u0010J\u001c\u0010\u001d\u001a\u00020\u00152\f\u0010 \u001a\b\u0012\u0004\u0012\u00020\"0!2\u0006\u0010\u0016\u001a\u00020\u0017J\u001c\u0010\u001d\u001a\u00020\u00152\f\u0010 
\u001a\b\u0012\u0004\u0012\u00020\"0#2\u0006\u0010\u0016\u001a\u00020\u0017J$\u0010$\u001a\b\u0012\u0004\u0012\u00020\"0#2\u0006\u0010%\u001a\u00020&2\u0006\u0010'\u001a\u00020(2\u0006\u0010)\u001a\u00020(R\u000e\u0010\b\u001a\u00020\tX\u0082\u0004¢\u0006\u0002\n��R\u0016\u0010\n\u001a\n \f*\u0004\u0018\u00010\u000b0\u000bX\u0082\u0004¢\u0006\u0002\n��¨\u0006*"}, d2 = {"Lai/platon/scent/tools/BasicWebHarvester;", "Lai/platon/scent/tools/SimpleCrawler;", "context", "Lai/platon/scent/skeleton/ScentContext;", "Lai/platon/scent/ScentContext;", "normalizer", "Lai/platon/pulsar/skeleton/crawl/filter/ScopedUrlNormalizer;", "(Lai/platon/scent/skeleton/ScentContext;Lai/platon/pulsar/skeleton/crawl/filter/ScopedUrlNormalizer;)V", "closed", "Ljava/util/concurrent/atomic/AtomicBoolean;", "logger", "Lorg/slf4j/Logger;", "kotlin.jvm.PlatformType", "arrangeDocument", "Lai/platon/scent/dom/nodes/AnchorGroup;", "portalUrl", "", "args", "buildViews", "", "result", "Lai/platon/scent/entities/HarvestResult;", "options", "Lai/platon/scent/dom/HarvestOptions;", "clearViews", "project", "Lai/platon/scent/ml/harvest/HarvestProject;", "projectId", "close", "harvest", "urls", "", "documents", "", "Lai/platon/pulsar/dom/FeaturedDocument;", "Lkotlin/sequences/Sequence;", "loadDocuments", "htmlBaseDir", "Ljava/nio/file/Path;", "start", "", "limit", "scent-boot"})
@SourceDebugExtension({"SMAP\nBasicWebHarvester.kt\nKotlin\n*S Kotlin\n*F\n+ 1 BasicWebHarvester.kt\nai/platon/scent/tools/BasicWebHarvester\n+ 2 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 3 fake.kt\nkotlin/jvm/internal/FakeKt\n*L\n1#1,127:1\n1855#2:128\n1864#2,3:129\n1549#2:132\n1620#2,3:133\n1549#2:136\n1620#2,3:137\n1856#2:141\n1#3:140\n*S KotlinDebug\n*F\n+ 1 BasicWebHarvester.kt\nai/platon/scent/tools/BasicWebHarvester\n*L\n40#1:128\n41#1:129,3\n43#1:132\n43#1:133,3\n44#1:136\n44#1:137,3\n40#1:141\n*E\n"})
/* loaded from: input_file:ai/platon/scent/tools/BasicWebHarvester.class */
public class BasicWebHarvester extends SimpleCrawler {
    /** SLF4J logger for this class; assigned once in the primary constructor. */
    private final Logger logger;

    /** Close flag: starts {@code false} and is flipped to {@code true} by the first {@code close()} call. */
    @NotNull
    private final AtomicBoolean closed;

    /**
     * Primary constructor: hands the context and the optional URL normalizer to
     * {@code SimpleCrawler}, then initializes the close flag and the logger.
     *
     * <p>JADX note: the {@code super} call was moved to the top of the method by the
     * decompiler, so the null check on {@code context} runs after it here.
     *
     * @param scentContext       the context ("context" in the Kotlin source); must not be null
     * @param scopedUrlNormalizer the URL normalizer ("normalizer" in the Kotlin source); may be null
     */
    public BasicWebHarvester(@NotNull ScentContext scentContext, @Nullable ScopedUrlNormalizer scopedUrlNormalizer) {
        super((PulsarContext) scentContext, scopedUrlNormalizer);
        Intrinsics.checkNotNullParameter(scentContext, "context");
        // Not closed yet; close() flips this exactly once.
        closed = new AtomicBoolean(false);
        logger = LoggerFactory.getLogger(BasicWebHarvester.class);
    }

    /**
     * Compiler-generated bridge for the Kotlin default arguments of the primary constructor.
     *
     * <p>{@code i} is a bit mask of omitted arguments: when bit 0 is set the context is
     * replaced by {@code ScentSQLContexts.INSTANCE.create()}, and when bit 1 is set the
     * normalizer is replaced by {@code null}. The marker parameter only disambiguates this
     * overload and is not read.
     */
    public /* synthetic */ BasicWebHarvester(ScentContext scentContext, ScopedUrlNormalizer scopedUrlNormalizer, int i, DefaultConstructorMarker defaultConstructorMarker) {
        this((i & 1) != 0 ? (ScentContext) ScentSQLContexts.INSTANCE.create() : scentContext, (i & 2) != 0 ? null : scopedUrlNormalizer);
    }

    /**
     * Loads and parses a portal page, asks the session to arrange its links into anchor
     * groups, and processes the first group in the set's iteration order.
     *
     * <p>For that first group the method:
     * <ol>
     *   <li>prints up to 10 of its URLs, in shuffled order, to standard output;</li>
     *   <li>loads the first {@code topLinks} URLs (limit taken from the normalized URL's options);</li>
     *   <li>parses every loaded page;</li>
     *   <li>passes the parsed documents to {@code session.arrangeDocuments}.</li>
     * </ol>
     * Afterwards the portal document's nodes are annotated and the document is exported
     * with {@code ExportPaths.Type.PORTAL}.
     *
     * @param str  the portal URL ("portalUrl" in the Kotlin source)
     * @param str2 the argument string turned into options by the session ("args")
     * @return the first anchor group, or {@code null} when no group was produced
     */
    @Nullable
    public final AnchorGroup arrangeDocument(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(str2, "args");

        HNormUrl portalUrl = ScentSession.DefaultImpls.normalize$default(getSession(), str, getSession().options(str2), false, 4, (Object) null);
        LoadOptions options = portalUrl.getHOptions();
        WebPage portalPage = getSession().load(portalUrl);
        FeaturedDocument portalDocument = getSession().parse(portalPage);
        SortedSet anchorGroups = getSession().arrangeLinks(portalUrl, portalDocument);

        this.logger.info("------------------------------");
        for (AnchorGroup group : CollectionsKt.take(anchorGroups, 1)) {
            // Show a random sample of the group's links, numbered from 1.
            int ordinal = 1;
            for (Object sample : CollectionsKt.take(CollectionsKt.shuffled(group.getUrlStrings()), 10)) {
                System.out.println(ordinal + ".\t" + ((String) sample));
                ordinal++;
            }

            // Load every selected link first ...
            List<String> topLinks = CollectionsKt.take(group.getUrlStrings(), options.getTopLinks());
            List<WebPage> pages = new ArrayList<>(topLinks.size());
            for (String link : topLinks) {
                BasicScentSession session = getSession();
                Intrinsics.checkNotNull(link);
                pages.add(session.load(link, options));
            }

            // ... then parse all of them, keeping the load-all-then-parse-all order.
            List<FeaturedDocument> documents = new ArrayList<>(pages.size());
            for (WebPage page : pages) {
                documents.add(ScentSession.DefaultImpls.parse$default(getSession(), page, options, false, 4, (Object) null));
            }

            getSession().arrangeDocuments(portalUrl, portalPage, CollectionsKt.asSequence(documents));
        }

        FullFeaturedDocumentKt.annotateNodes(portalDocument, options);
        ScentSession.DefaultImpls.export$default(getSession(), portalDocument, ExportPaths.Type.PORTAL, false, 4, (Object) null);
        return (AnchorGroup) CollectionsKt.firstOrNull(anchorGroups);
    }

    /**
     * Harvests a portal URL, with the harvest options built by the session from an
     * argument string.
     *
     * @param str  the portal URL ("portalUrl" in the Kotlin source)
     * @param str2 the argument string ("args") handed to {@code session.options}
     * @return the result of {@link #harvest(String, HarvestOptions)}
     */
    @NotNull
    public final HarvestResult harvest(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(str2, "args");
        HarvestOptions parsedOptions = getSession().options(str2);
        return harvest(str, parsedOptions);
    }

    /**
     * Harvests an in-memory list of documents.
     *
     * <p>Blocks the calling thread while the compiler-generated coroutine body
     * ({@code BasicWebHarvester$harvest$result$1}) runs, then builds the views for the
     * result it produced.
     *
     * @param list           the documents to harvest ("documents" in the Kotlin source)
     * @param harvestOptions the harvest options ("options")
     * @return the harvest result returned by the coroutine body
     */
    @NotNull
    public final HarvestResult harvest(@NotNull List<? extends FeaturedDocument> list, @NotNull HarvestOptions harvestOptions) {
        Intrinsics.checkNotNullParameter(list, "documents");
        Intrinsics.checkNotNullParameter(harvestOptions, "options");
        BasicWebHarvester$harvest$result$1 harvestBlock = new BasicWebHarvester$harvest$result$1(this, list, harvestOptions, null);
        HarvestResult outcome = (HarvestResult) BuildersKt.runBlocking$default((CoroutineContext) null, harvestBlock, 1, (Object) null);
        buildViews(outcome, harvestOptions);
        return outcome;
    }

    /**
     * Harvests a portal URL with explicit options.
     *
     * <p>Blocks the calling thread while the compiler-generated coroutine body
     * ({@code BasicWebHarvester$harvest$result$2}) runs, then builds the views for the
     * result it produced.
     *
     * @param str            the portal URL ("portalUrl" in the Kotlin source)
     * @param harvestOptions the harvest options ("options")
     * @return the harvest result returned by the coroutine body
     */
    @NotNull
    public final HarvestResult harvest(@NotNull String str, @NotNull HarvestOptions harvestOptions) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(harvestOptions, "options");
        BasicWebHarvester$harvest$result$2 harvestBlock = new BasicWebHarvester$harvest$result$2(harvestOptions, this, str, null);
        HarvestResult outcome = (HarvestResult) BuildersKt.runBlocking$default((CoroutineContext) null, harvestBlock, 1, (Object) null);
        buildViews(outcome, harvestOptions);
        return outcome;
    }

    /**
     * Harvests the documents behind a collection of URLs.
     *
     * <p>Each URL is mapped to a document through {@code session.loadDocument}. The mapping
     * is a Kotlin sequence operation, so a document is loaded only when the sequence is
     * iterated. The resulting sequence is handed to
     * {@link #harvest(Sequence, HarvestOptions)} together with the options the session
     * builds from {@code str}.
     *
     * @param iterable the URLs to load ("urls" in the Kotlin source)
     * @param str      the argument string ("args") handed to {@code session.options}
     * @return the harvest result
     */
    @NotNull
    public final HarvestResult harvest(@NotNull Iterable<String> iterable, @NotNull String str) {
        Intrinsics.checkNotNullParameter(iterable, "urls");
        Intrinsics.checkNotNullParameter(str, "args");
        // A plain lambda replaces the decompiled anonymous class, whose `super(1)` initializer
        // is not valid Java. The typed local pins the Sequence overload of harvest().
        Sequence<FeaturedDocument> documents = SequencesKt.map(CollectionsKt.asSequence(iterable), (String url) -> {
            Intrinsics.checkNotNullParameter(url, "it");
            return getSession().loadDocument(url);
        });
        return harvest(documents, getSession().options(str));
    }

    /**
     * Harvests a sequence of documents through the session, logs completion, and builds
     * the views for the result.
     *
     * @param sequence       the documents to harvest ("documents" in the Kotlin source)
     * @param harvestOptions the harvest options ("options")
     * @return the harvest result produced by {@code session.harvest}
     */
    @NotNull
    public final HarvestResult harvest(@NotNull Sequence<? extends FeaturedDocument> sequence, @NotNull HarvestOptions harvestOptions) {
        Intrinsics.checkNotNullParameter(sequence, "documents");
        Intrinsics.checkNotNullParameter(harvestOptions, "options");

        final HarvestResult outcome = getSession().harvest(sequence, harvestOptions);

        this.logger.info("Harvest finished.");
        this.logger.info("Ready to build views for the harvest result ...");

        buildViews(outcome, harvestOptions);
        return outcome;
    }

    /**
     * Builds all views for a harvest result and logs where they were written.
     *
     * <p>The session's {@code buildAll} is called with the result's table group. The parent
     * directory of the first key of the returned map is logged as a {@code file://}
     * location. Nothing is logged when the map has no keys or the first key has no parent.
     * Any exception is caught and reported as a warning, so this method is best-effort.
     *
     * @param harvestResult  the harvest result whose table group is rendered ("result")
     * @param harvestOptions the harvest options ("options")
     */
    public final void buildViews(@NotNull HarvestResult harvestResult, @NotNull HarvestOptions harvestOptions) {
        Intrinsics.checkNotNullParameter(harvestResult, "result");
        Intrinsics.checkNotNullParameter(harvestOptions, "options");
        try {
            Path firstView = (Path) CollectionsKt.firstOrNull(getSession().buildAll(harvestResult.getTableGroup(), harvestOptions).keySet());
            if (firstView != null) {
                Path viewDirectory = firstView.getParent();
                if (viewDirectory != null) {
                    this.logger.info("Harvest result: file://" + viewDirectory);
                }
            }
        } catch (Exception e) {
            LogsKt.warnUnexpected(this, e, "Failed to report harvest result", new Object[0]);
        }
    }

    /**
     * Deletes the result base directory of the harvest project with the given id.
     *
     * @param str the project id ("projectId" in the Kotlin source) used to construct the
     *            {@code HarvestProject}
     */
    public final void clearViews(@NotNull String str) {
        Intrinsics.checkNotNullParameter(str, "projectId");
        // Same deletion as the HarvestProject overload, so reuse it.
        clearViews(new HarvestProject(str));
    }

    /**
     * Deletes the result base directory of the given harvest project via
     * {@code FileUtils.deleteDirectory}.
     *
     * @param harvestProject the project whose result directory is removed ("project")
     */
    public final void clearViews(@NotNull HarvestProject harvestProject) {
        Intrinsics.checkNotNullParameter(harvestProject, "project");
        Path resultBaseDir = harvestProject.getResultBaseDir();
        FileUtils.deleteDirectory(resultBaseDir.toFile());
    }

    /**
     * Loads HTML files from a directory as a sequence of parsed documents.
     *
     * <p>The method first counts the directory entries whose file name ends with
     * {@code ".html"}. A missing directory counts as zero. With zero matches an empty
     * sequence is returned; with fewer than 20 a warning is logged. The entries matching
     * the glob {@code "*.html"} are then listed, {@code i} entries are skipped and at most
     * {@code i2} are kept. Each kept file is parsed as UTF-8 with its path string as the
     * document location, and the document's base URI is set to its normalized URI, or to
     * its existing base URI when the normalized URI is {@code null}. Parsing is part of a
     * Kotlin sequence pipeline, so it happens as the returned sequence is iterated.
     *
     * <p>NOTE(review): {@code Files.list} can throw {@code IOException}. As in the
     * decompiled original it is not declared here, because the Kotlin source has no
     * checked exceptions. Confirm how callers expect it to surface.
     *
     * @param path the directory containing the HTML files ("htmlBaseDir" in the Kotlin source)
     * @param i    number of leading entries to skip ("start")
     * @param i2   maximum number of entries to load ("limit")
     * @return a sequence of parsed documents; empty when no {@code .html} file is found
     */
    @NotNull
    public final Sequence<FeaturedDocument> loadDocuments(@NotNull Path path, int i, int i2) {
        Intrinsics.checkNotNullParameter(path, "htmlBaseDir");

        long htmlFileCount = 0;
        if (!Files.notExists(path)) {
            // Files.list holds an open directory handle until the stream is closed, so it
            // must be closed explicitly; the original left it open.
            try (Stream<Path> entries = Files.list(path)) {
                htmlFileCount = entries
                        .filter(entry -> entry.getFileName().toString().endsWith(".html"))
                        .count();
            }
        }

        if (htmlFileCount == 0) {
            return SequencesKt.sequenceOf(new FeaturedDocument[0]);
        }
        if (htmlFileCount < 20) {
            this.logger.warn("Too few samples, might not generate a good result");
        }

        Sequence<Path> selectedFiles = SequencesKt.take(
                SequencesKt.drop(CollectionsKt.asSequence(PathsKt.listDirectoryEntries(path, "*.html")), i), i2);

        Sequence<FeaturedDocument> parsed = SequencesKt.map(selectedFiles, (Path file) -> {
            Intrinsics.checkNotNullParameter(file, "it");
            return Documents.INSTANCE.parse(file, "UTF-8", file.toString());
        });

        return SequencesKt.onEach(parsed, (FeaturedDocument featuredDocument) -> {
            Intrinsics.checkNotNullParameter(featuredDocument, "it");
            Document document = featuredDocument.getDocument();
            String baseUri = featuredDocument.getNormalizedURI();
            if (baseUri == null) {
                baseUri = featuredDocument.getBaseURI();
            }
            document.setBaseUri(baseUri);
            return Unit.INSTANCE;
        });
    }

    /**
     * Marks this harvester as closed by flipping the {@code closed} flag from
     * {@code false} to {@code true}. Repeated calls have no further effect.
     *
     * <p>NOTE(review): no other work is done on close and {@code super.close()} is not
     * invoked here. Confirm that this is intentional.
     */
    @Override // ai.platon.scent.tools.SimpleCrawler, java.lang.AutoCloseable
    public void close() {
        this.closed.compareAndSet(false, true);
    }

    /**
     * Compiler-generated adapter that applies a Kotlin {@code Function1} to a value and
     * unboxes its {@code Boolean} result, so it can serve as a Java predicate.
     *
     * @param function1 the Kotlin function to apply; must not be null
     * @param obj       the value passed to the function
     * @return the unboxed result of {@code function1.invoke(obj)}
     */
    private static final boolean loadDocuments$lambda$7(Function1 function1, Object obj) {
        Intrinsics.checkNotNullParameter(function1, "$tmp0");
        Boolean verdict = (Boolean) function1.invoke(obj);
        return verdict.booleanValue();
    }

    /**
     * No-argument constructor. Delegates to the synthetic default-arguments constructor
     * with mask {@code 3} (bits 0 and 1 set), so both the context and the normalizer take
     * their defaults.
     */
    public BasicWebHarvester() {
        this(null, null, 3, null);
    }
}
