package ai.platon.scent.extract;

import ai.platon.pulsar.common.DateTimes;
import ai.platon.pulsar.common.LangKt;
import ai.platon.pulsar.common.config.ImmutableConfig;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.dom.model.FragmentCategory;
import ai.platon.pulsar.dom.model.PageAttribute;
import ai.platon.pulsar.dom.model.PageEntity;
import ai.platon.pulsar.dom.model.PageEntityKt;
import ai.platon.scent.dom.nodes.data.BlockLabel;
import ai.platon.scent.extract.common.extractor.ConfiguredExtractor;
import ai.platon.scent.extract.common.extractor.ExtractorBeanFactory;
import ai.platon.scent.extract.common.extractor.regex.ConfiguredRegexExtractor;
import com.google.common.net.InternetDomainName;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import kotlin.Deprecated;
import kotlin.Lazy;
import kotlin.Metadata;
import kotlin.NotImplementedError;
import kotlin.collections.CollectionsKt;
import kotlin.jvm.functions.Function0;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.Reflection;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.jvm.internal.StringCompanionObject;
import kotlin.reflect.KClass;
import kotlin.reflect.KFunction;
import kotlin.text.Regex;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Element;

/* compiled from: PageExtractor.kt */
@Deprecated(message = "Use ML and X-SQL system instead")
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��<\n\u0002\u0018\u0002\n\u0002\u0010��\n��\n\u0002\u0018\u0002\n\u0002\b\u0004\n\u0002\u0018\u0002\n\u0002\b\u0005\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n��\n\u0002\u0010\u0002\n\u0002\b\u0004\n\u0002\u0010\u000e\n��\n\u0002\u0018\u0002\n\u0002\b\u0006\b\u0007\u0018��2\u00020\u0001B\r\u0012\u0006\u0010\u0002\u001a\u00020\u0003¢\u0006\u0002\u0010\u0004J\u000e\u0010\r\u001a\u00020\u000e2\u0006\u0010\u000f\u001a\u00020\u0010J\u0018\u0010\u0011\u001a\u00020\u00122\u0006\u0010\u000f\u001a\u00020\u00102\u0006\u0010\u0013\u001a\u00020\u000eH\u0002J\u0010\u0010\u0014\u001a\u00020\u00122\u0006\u0010\u0013\u001a\u00020\u000eH\u0002J\u0018\u0010\u0015\u001a\u00020\u00122\u0006\u0010\u000f\u001a\u00020\u00102\u0006\u0010\u0013\u001a\u00020\u000eH\u0002J\u0010\u0010\u0016\u001a\u00020\u00172\u0006\u0010\u000f\u001a\u00020\u0010H\u0002J\u0010\u0010\u0018\u001a\u00020\u00192\u0006\u0010\u000f\u001a\u00020\u0010H\u0002J\u0010\u0010\u001a\u001a\u00020\u00192\u0006\u0010\u000f\u001a\u00020\u0010H\u0002J\u0010\u0010\u001b\u001a\u00020\u00192\u0006\u0010\u001c\u001a\u00020\u0017H\u0002J\u0010\u0010\u001d\u001a\u00020\u00192\u0006\u0010\u001e\u001a\u00020\u0017H\u0002R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\u0005\u0010\u0006R\u001b\u0010\u0007\u001a\u00020\b8FX\u0086\u0084\u0002¢\u0006\f\n\u0004\b\u000b\u0010\f\u001a\u0004\b\t\u0010\n¨\u0006\u001f"}, d2 = {"Lai/platon/scent/extract/PageExtractor;", "", "conf", "Lai/platon/pulsar/common/config/ImmutableConfig;", "(Lai/platon/pulsar/common/config/ImmutableConfig;)V", "getConf", "()Lai/platon/pulsar/common/config/ImmutableConfig;", "fragmentExtractor", "Lai/platon/scent/extract/FragmentExtractor;", "getFragmentExtractor", "()Lai/platon/scent/extract/FragmentExtractor;", "fragmentExtractor$delegate", "Lkotlin/Lazy;", "extract", "Lai/platon/pulsar/dom/model/PageEntity;", "doc", "Lai/platon/pulsar/dom/FeaturedDocument;", "extractMetadata", "", "pageEntity", 
"extractMetadataByRegex", "extractRobotDefined", "getOriginalLink", "", "getPageDescription", "Lai/platon/pulsar/dom/model/PageAttribute;", "getPageKeywords", "getSourceLink", "baseUri", "getWebsiteDomain", "domain", "scent-extract"})
@SourceDebugExtension({"SMAP\nPageExtractor.kt\nKotlin\n*S Kotlin\n*F\n+ 1 PageExtractor.kt\nai/platon/scent/extract/PageExtractor\n+ 2 ExtractorBeanFactory.kt\nai/platon/scent/extract/common/extractor/ExtractorBeanFactory\n+ 3 BeanFactory.kt\nai/platon/pulsar/common/BeanFactory\n+ 4 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 5 _Maps.kt\nkotlin/collections/MapsKt___MapsKt\n*L\n1#1,121:1\n15#2,7:122\n22#2:143\n23#2,4:147\n28#2:152\n45#3:129\n35#3,13:130\n48#3,4:153\n1549#4:144\n1620#4,2:145\n1622#4:151\n215#5,2:157\n215#5,2:159\n*S KotlinDebug\n*F\n+ 1 PageExtractor.kt\nai/platon/scent/extract/PageExtractor\n*L\n50#1:122,7\n50#1:143\n50#1:147,4\n50#1:152\n50#1:129\n50#1:130,13\n50#1:153,4\n50#1:144\n50#1:145,2\n50#1:151\n56#1:157,2\n60#1:159,2\n*E\n"})
/* loaded from: input_file:ai/platon/scent/extract/PageExtractor.class */
public final class PageExtractor {

    /** Configuration passed to the constructor; returned by {@link #getConf()} and used to build collaborators. */
    @NotNull
    private final ImmutableConfig conf;

    /** Kotlin {@code Lazy} holder installed by the constructor; {@link #getFragmentExtractor()} reads its value. */
    @NotNull
    private final Lazy fragmentExtractor$delegate;

    /**
     * Creates an extractor bound to the given configuration.
     *
     * <p>No {@link FragmentExtractor} is built here. A factory is handed to
     * {@code LangKt.usfLazy} and the resulting {@code Lazy} is stored; the factory creates the
     * extractor from {@link #getConf()} when it is invoked.
     *
     * @param immutableConfig configuration for this extractor; must not be null
     */
    public PageExtractor(@NotNull ImmutableConfig immutableConfig) {
        Intrinsics.checkNotNullParameter(immutableConfig, "conf");
        this.conf = immutableConfig;
        // The decompiled form called super(0) from an initializer block and exposed the factory
        // as m0invoke(), which does not implement Function0.invoke() and is not valid Java.
        // Overriding invoke() directly restores a compilable, equivalent factory.
        this.fragmentExtractor$delegate = LangKt.usfLazy(new Function0<FragmentExtractor>() {
            @Override
            @NotNull
            public FragmentExtractor invoke() {
                return new FragmentExtractor(PageExtractor.this.getConf());
            }
        });
    }

    /**
     * Exposes the configuration supplied at construction time.
     *
     * @return the configuration held by this extractor
     */
    @NotNull
    public final ImmutableConfig getConf() {
        return conf;
    }

    /**
     * Returns the fragment extractor held by the lazy delegate.
     *
     * @return the value of the {@code fragmentExtractor$delegate} holder, cast to {@link FragmentExtractor}
     */
    @NotNull
    public final FragmentExtractor getFragmentExtractor() {
        // The delegate is declared as a raw Lazy, so its value comes back as Object.
        final Object extractor = fragmentExtractor$delegate.getValue();
        return (FragmentExtractor) extractor;
    }

    /**
     * Placeholder entry point for page extraction; no extraction logic is wired in.
     *
     * <p>After the null check on the argument this method unconditionally throws, so it never
     * returns a {@link PageEntity}.
     *
     * @param featuredDocument the document to extract from; must not be null
     * @return never returns normally
     * @throws NotImplementedError always
     */
    @NotNull
    public final PageEntity extract(@NotNull FeaturedDocument featuredDocument) {
        Intrinsics.checkNotNullParameter(featuredDocument, "doc");
        // The decompiled code invoked the compiler-synthetic default-argument constructor
        // (String, int, DefaultConstructorMarker), which Java source cannot call. The public
        // one-argument constructor with Kotlin's default message is the equivalent.
        throw new NotImplementedError("An operation is not implemented.");
    }

    /**
     * Copies the page-level metadata of {@code featuredDocument} into {@code pageEntity}.
     *
     * <p>Sets the base URI, title and location, records the extraction time under
     * "Meta-ExtractedAt", and adds the source link, the website domain (only when one can be
     * derived from the source link), the keywords and the description attributes.
     *
     * @param featuredDocument document the metadata is read from
     * @param pageEntity       entity that receives the metadata
     */
    private final void extractMetadata(FeaturedDocument featuredDocument, PageEntity pageEntity) {
        pageEntity.setBaseURI(featuredDocument.getLocation());
        pageEntity.setTitle(featuredDocument.getTitle());

        // A location given as an absolute path is recorded as a file: URL.
        String location = featuredDocument.getLocation();
        if (location.startsWith("/")) {
            location = "file:" + location;
        }
        pageEntity.setLocation(location);
        pageEntity.put("Meta-ExtractedAt", DateTimes.now(), PageEntityKt.getMETADATA());

        String originalLink = getOriginalLink(featuredDocument);
        pageEntity.add(getSourceLink(originalLink));

        // The domain is optional: links without a registrable domain (bare paths, file: URLs,
        // IP hosts, localhost) must not abort the rest of the metadata extraction.
        String domain = resolveTopPrivateDomain(originalLink);
        if (!domain.isEmpty()) {
            pageEntity.add(getWebsiteDomain(domain));
        }

        pageEntity.add(getPageKeywords(featuredDocument));
        pageEntity.add(getPageDescription(featuredDocument));
    }

    /**
     * Derives the top private domain of the host in {@code link}.
     *
     * @param link the link whose host is inspected
     * @return the top private domain, or an empty string when {@code link} is not a valid URL
     *         or its host is not a domain under a public suffix
     */
    private static String resolveTopPrivateDomain(String link) {
        try {
            String host = new URL(link).getHost();
            return InternetDomainName.from(host).topPrivateDomain().toString();
        } catch (java.net.MalformedURLException | IllegalArgumentException | IllegalStateException ignored) {
            // MalformedURLException: link has no/unknown protocol (e.g. a bare "/path").
            // IllegalArgumentException: host is not a syntactically valid domain name (empty, IP).
            // IllegalStateException: host is not under a public suffix (e.g. "localhost").
            return "";
        }
    }

    /**
     * Runs a configured regex extractor over the first "Meta-Title" and "Meta-Description" values
     * stored in {@code pageEntity} and writes every extracted key/value pair back into it.
     *
     * <p>The extractor is looked up in the {@link ExtractorBeanFactory} object cache under an id
     * built from {@link ConfiguredRegexExtractor} and "EXTRACT". On a miss it is created
     * reflectively from the class's constructors and stored in the cache. The lookup/creation
     * part appears to be bean-factory code inlined by the Kotlin compiler.
     *
     * @param pageEntity entity that is read (first title/description value) and updated
     * @throws RuntimeException if any constructor of the extractor class takes neither one nor two parameters
     */
    private final void extractMetadataByRegex(PageEntity pageEntity) {
        Object obj;
        Object call;
        ExtractorBeanFactory extractorBeanFactory = new ExtractorBeanFactory(this.conf);
        KClass<?> orCreateKotlinClass = Reflection.getOrCreateKotlinClass(ConfiguredRegexExtractor.class);
        // Extra argument for the two-parameter constructor form: the extractor's resource path.
        String[] strArr = {extractorBeanFactory.getResourcePath(orCreateKotlinClass).toString()};
        String cacheId = extractorBeanFactory.cacheId(orCreateKotlinClass, new String[]{"EXTRACT"});
        ExtractorBeanFactory extractorBeanFactory2 = extractorBeanFactory;
        // Reuse a cached bean only when it exists and has the expected type.
        Object bean = extractorBeanFactory2.getObjectCache().getBean(cacheId);
        if (bean == null || !(bean instanceof ConfiguredRegexExtractor)) {
            obj = null;
        } else {
            // The cache is queried a second time here instead of reusing `bean`.
            Object bean2 = extractorBeanFactory2.getObjectCache().getBean(cacheId);
            if (bean2 == null) {
                throw new NullPointerException("null cannot be cast to non-null type ai.platon.scent.extract.common.extractor.regex.ConfiguredRegexExtractor");
            }
            obj = (ConfiguredRegexExtractor) bean2;
        }
        Object obj2 = obj;
        if (obj2 == null) {
            // Cache miss: every constructor is invoked (not only the first usable one); a
            // constructor with any other parameter count aborts the whole lookup.
            Collection<KFunction> constructors = orCreateKotlinClass.getConstructors();
            ArrayList arrayList = new ArrayList(CollectionsKt.collectionSizeOrDefault(constructors, 10));
            for (KFunction kFunction : constructors) {
                switch (kFunction.getParameters().size()) {
                    case 1:
                        call = kFunction.call(new Object[]{extractorBeanFactory.getConf()});
                        break;
                    case 2:
                        call = kFunction.call(new Object[]{extractorBeanFactory.getConf(), strArr});
                        break;
                    default:
                        throw new RuntimeException("Failed to load class " + orCreateKotlinClass);
                }
                arrayList.add((ConfiguredExtractor) call);
            }
            // Only the first created instance is kept and cached.
            obj2 = (ConfiguredExtractor) CollectionsKt.first(arrayList);
            Intrinsics.checkNotNull(obj2);
            extractorBeanFactory2.putBean(cacheId, obj2);
        }
        // NOTE(review): the value is cast to ConfiguredExtractor but assigned to a
        // ConfiguredRegexExtractor variable — looks like a decompiler artifact; confirm the intended cast.
        ConfiguredRegexExtractor configuredRegexExtractor = (ConfiguredExtractor) obj2;
        String firstValue = pageEntity.firstValue("Meta-Title");
        String firstValue2 = pageEntity.firstValue("Meta-Description");
        if (firstValue != null) {
            // Pairs extracted from the title are stored with the METADATA argument.
            for (Map.Entry<String, String> entry : configuredRegexExtractor.extract(BlockLabel.Companion.getMetadata().getText(), firstValue).entrySet()) {
                pageEntity.put(entry.getKey(), entry.getValue(), PageEntityKt.getMETADATA());
            }
        }
        if (firstValue2 != null) {
            // Pairs extracted from the description are stored with the "Meta-Description" label and a
            // defaulted category instead. NOTE(review): confirm this asymmetry with the title branch is intended.
            for (Map.Entry<String, String> entry2 : configuredRegexExtractor.extract(BlockLabel.Companion.getMetadata().getText(), firstValue2).entrySet()) {
                PageEntity.put$default(pageEntity, entry2.getKey(), entry2.getValue(), "Meta-Description", (FragmentCategory) null, 8, (Object) null);
            }
        }
    }

    /**
     * Copies every element matching {@code .satellite-inserted} into {@code pageEntity}.
     *
     * <p>Each element contributes one entry built from its {@code name}, {@code value} and
     * {@code label} attributes. When the label equals "Captured" (ignoring case), every
     * "file://" occurrence is removed from the value and the remainder is wrapped in an
     * {@code <img>} snippet before it is stored.
     *
     * @param featuredDocument document whose elements are scanned
     * @param pageEntity       entity that receives one entry per matching element
     */
    private final void extractRobotDefined(FeaturedDocument featuredDocument, PageEntity pageEntity) {
        Iterator it = FeaturedDocument.select$default(featuredDocument, ".satellite-inserted", 0, 0, 6, (Object) null).iterator();
        while (it.hasNext()) {
            Element element = (Element) it.next();
            String name = element.attr("name");
            Intrinsics.checkNotNullExpressionValue(name, "attr(...)");
            String value = element.attr("value");
            Intrinsics.checkNotNullExpressionValue(value, "attr(...)");
            String label = element.attr("label");
            Intrinsics.checkNotNullExpressionValue(label, "attr(...)");
            if (StringsKt.equals(label, "Captured", true)) {
                // "file://" contains no regex metacharacters, so a literal replace-all is
                // equivalent to the former per-element Regex and avoids recompiling it.
                // NOTE(review): the value is interpolated into markup without HTML escaping.
                value = String.format("<div><img src='%s' /></div>", value.replace("file://", ""));
            }
            PageEntity.put$default(pageEntity, name, value, label, (FragmentCategory) null, 8, (Object) null);
        }
    }

    /**
     * Wraps a domain name in a "Meta-Domain" page attribute.
     *
     * @param str the domain name to store as the attribute value
     * @return a new attribute named "Meta-Domain" carrying {@code str} and the METADATA value
     */
    private final PageAttribute getWebsiteDomain(String str) {
        // Only name, value and the METADATA argument are supplied explicitly; the null
        // placeholders and the trailing mask/marker come from Kotlin's default-argument call form.
        final PageAttribute domainAttribute = new PageAttribute(
                "Meta-Domain",
                str,
                (String) null,
                (String) null,
                (String) null,
                PageEntityKt.getMETADATA(),
                (String) null,
                (HashSet) null,
                220,
                (DefaultConstructorMarker) null);
        return domainAttribute;
    }

    /**
     * Determines the link the document originally came from.
     *
     * <p>Candidates are tried in order and the first non-empty one wins: the body's
     * {@code data-url} attribute, then the {@code href} of the canonical link in the head.
     * If both are empty the document location is returned.
     *
     * @param featuredDocument document to inspect
     * @return the original link of the document
     */
    private final String getOriginalLink(FeaturedDocument featuredDocument) {
        final String dataUrl = featuredDocument.getBody().attr("data-url");
        Intrinsics.checkNotNullExpressionValue(dataUrl, "attr(...)");
        if (!dataUrl.isEmpty()) {
            return dataUrl;
        }

        final String canonicalHref = FeaturedDocument
                .select$default(featuredDocument, "html head link[rel=canonical]", 0, 0, 6, (Object) null)
                .attr("href");
        Intrinsics.checkNotNullExpressionValue(canonicalHref, "attr(...)");
        if (canonicalHref.isEmpty()) {
            return featuredDocument.getLocation();
        }
        return canonicalHref;
    }

    /**
     * Wraps a link in a "Meta-Link" page attribute.
     *
     * @param str the link to store as the attribute value
     * @return a new attribute named "Meta-Link" carrying {@code str} and the METADATA value
     */
    private final PageAttribute getSourceLink(String str) {
        // Only name, value and the METADATA argument are supplied explicitly; the null
        // placeholders and the trailing mask/marker come from Kotlin's default-argument call form.
        final PageAttribute linkAttribute = new PageAttribute(
                "Meta-Link",
                str,
                (String) null,
                (String) null,
                (String) null,
                PageEntityKt.getMETADATA(),
                (String) null,
                (HashSet) null,
                220,
                (DefaultConstructorMarker) null);
        return linkAttribute;
    }

    /**
     * Builds the "Meta-Keywords" attribute from the document's keywords meta tag.
     *
     * @param featuredDocument document whose {@code meta[name=keywords]} content is read
     * @return a new attribute named "Meta-Keywords" holding the tag's {@code content} value
     */
    private final PageAttribute getPageKeywords(FeaturedDocument featuredDocument) {
        final String keywords = FeaturedDocument
                .select$default(featuredDocument, "html head meta[name=keywords]", 0, 0, 6, (Object) null)
                .attr("content");
        Intrinsics.checkNotNullExpressionValue(keywords, "attr(...)");
        return new PageAttribute(
                "Meta-Keywords",
                keywords,
                (String) null,
                (String) null,
                (String) null,
                PageEntityKt.getMETADATA(),
                (String) null,
                (HashSet) null,
                220,
                (DefaultConstructorMarker) null);
    }

    /**
     * Builds the "Meta-Description" attribute from the document's description meta tag.
     *
     * @param featuredDocument document whose {@code meta[name=description]} content is read
     * @return a new attribute named "Meta-Description" holding the tag's {@code content} value
     */
    private final PageAttribute getPageDescription(FeaturedDocument featuredDocument) {
        final String description = FeaturedDocument
                .select$default(featuredDocument, "html head meta[name=description]", 0, 0, 6, (Object) null)
                .attr("content");
        Intrinsics.checkNotNullExpressionValue(description, "attr(...)");
        return new PageAttribute(
                "Meta-Description",
                description,
                (String) null,
                (String) null,
                (String) null,
                PageEntityKt.getMETADATA(),
                (String) null,
                (HashSet) null,
                220,
                (DefaultConstructorMarker) null);
    }
}
