Facebook
From Sludgy Hedgehog, 1 Year ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 122
  1. a = (
  2.     domains_nameservers.select("*")
  3.     .fillna("1970-01-01", ["first_seen"])
  4.     .fillna("2022-12-31", ["last_seen"])
  5.     .groupBy("domain_id", "last_seen", "first_seen")
  6.     .agg(F.countDistinct("*").name("counter"))
  7.     .orderBy(F.asc("counter"))
  8. )
  9. import sys
  10.  
  11. import pyspark.sql.functions as func
  12. from pyspark.sql.window import Window
  13.  
  14. windowSpec = Window.partitionBy(F.col("domain_id")).orderBy(
  15.     F.col("last_seen"), F.col("first_seen")
  16. )
  17. b = a.select("*").withColumn(
  18.     "grp",
  19.     ((F.datediff(F.col("first_seen"), F.lag(F.col("last_seen"), 1).over(windowSpec)))),
  20. )
  21. c = b.select("*").withColumn(
  22.     "match",
  23.     F.when(
  24.         (
  25.             (
  26.                 F.floor(
  27.                     F.datediff(
  28.                         F.col("first_seen"),
  29.                         F.lag(F.col("last_seen"), 1).over(windowSpec),
  30.                     )
  31.                     / 91
  32.                 )
  33.             )
  34.         )
  35.         > 1,
  36.         "1",
  37.     ).otherwise("0"),
  38. )
  39. w = (
  40.     Window.partitionBy("domain_id")
  41.     .orderBy("last_seen", "first_seen")
  42.     .rowsBetween(Window.unboundedPreceding, Window.currentRow)
  43. )
  44. newDF = c.select("*").withColumn("val_sum", F.sum(F.col("match")).over(w))
  45. nom = (
  46.     newDF.select("*")
  47.     .groupBy("domain_id", "val_sum")
  48.     .agg(F.min("first_seen").name("first_seen"), F.max("last_seen").name("last_seen"))
  49.     .where("first_seen!='1970-01-01' and last_seen!='2022-12-31'")
  50. )
  51.  

Replies to Untitled rss

Title Name Language When
Re: Untitled Antonia python 1 Year ago.