In [1]:
from IPython.display import HTML
HTML("""
<style>
/* ocultar la primera celda */
.jp-CodeCell:first-child {
display:none;
}
/* tipografía general */
body {
font-family: 'Segoe UI', Arial, sans-serif;
line-height: 1.7;
background:#fafafc;
}
/* títulos */
h1 {
color:#6B8EEC;
border-bottom:3px solid #BFD3FF;
padding-bottom:6px;
}
h2 {
color:#5A5A7A;
margin-top:30px;
}
/* párrafos */
p {
font-size:16px;
color:#444;
}
/* bloques de código */
.jp-CodeCell pre {
background:#f4f6ff !important;
color:#333 !important;
border-left:4px solid #9FB8FF;
padding:12px;
border-radius:6px;
}
/* resultados */
.jp-OutputArea pre {
background:#fdf6ff;
border-left:4px solid #D7B8FF;
padding:10px;
border-radius:6px;
}
/* tablas de pandas */
.dataframe {
border-collapse: collapse !important;
margin-top:15px;
background:white;
border-radius:8px;
overflow:hidden;
}
.dataframe th {
background:#C9D8FF !important;
color:#333 !important;
padding:8px;
}
.dataframe td {
padding:8px;
border-bottom:1px solid #eee;
}
/* cajas informativas */
.note {
background:#EEF3FF;
padding:15px;
border-radius:8px;
border-left:6px solid #9FB8FF;
}
/* listas */
ul {
line-height:1.8;
}
</style>
""")
Out[1]:
In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc
spark = SparkSession.builder.master("local").appName("Netflix").getOrCreate()
file_path = "netflix.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)
df.show()
print(df.columns)
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+ |show_id| type| title| director| cast| country| date_added|release_year|rating| duration| listed_in| description| +-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+ | s1| Movie|Dick Johnson Is Dead| Kirsten Johnson| NULL| United States|September 25, 2021| 2020| PG-13| 90 min| Documentaries|As her father nea...| | s2|TV Show| Blood & Water| NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021| 2021| TV-MA|2 Seasons|International TV ...|After crossing pa...| | s3|TV Show| Ganglands| Julien Leclercq|Sami Bouajila, Tr...| NULL|September 24, 2021| 2021| TV-MA| 1 Season|Crime TV Shows, I...|To protect his fa...| | s4|TV Show|Jailbirds New Orl...| NULL| NULL| NULL|September 24, 2021| 2021| TV-MA| 1 Season|Docuseries, Reali...|Feuds, flirtation...| | s5|TV Show| Kota Factory| NULL|Mayur More, Jiten...| India|September 24, 2021| 2021| TV-MA|2 Seasons|International TV ...|In a city of coac...| | s6|TV Show| Midnight Mass| Mike Flanagan|Kate Siegel, Zach...| NULL|September 24, 2021| 2021| TV-MA| 1 Season|TV Dramas, TV Hor...|The arrival of a ...| | s7| Movie|My Little Pony: A...|Robert Cullen, Jo...|Vanessa Hudgens, ...| NULL|September 24, 2021| 2021| PG| 91 min|Children & Family...|Equestria's divid...| | s8| Movie| Sankofa| Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021| 1993| TV-MA| 125 min|Dramas, Independe...|On a photo shoot ...| | s9|TV Show|The Great British...| Andy Devonshire|Mel Giedroyc, Sue...| United Kingdom|September 24, 2021| 2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...| | s10| Movie| The Starling| Theodore Melfi|Melissa McCarthy,...| United States|September 24, 2021| 2021| PG-13| 104 min| Comedies, Dramas|A woman adjusting...| | s11|TV Show|Vendetta: Truth, ...| NULL| NULL| NULL|September 24, 2021| 2021| TV-MA| 1 Season|Crime TV Shows, D...|"Sicily boasts a ...| | s12|TV Show| Bangkok Breaking| Kongkiat Komesiri|Sukollawat Kanaro...| NULL|September 23, 2021| 2021| TV-MA| 1 Season|Crime TV Shows, I...|Struggling to ear...| | s13| Movie| Je Suis Karl| Christian Schwochow|Luna Wedler, Jann...|Germany, Czech Re...|September 23, 2021| 2021| TV-MA| 127 min|Dramas, Internati...|After most of her...| | s14| Movie|Confessions of an...| Bruno Garotti|Klara Castanho, L...| NULL|September 22, 2021| 2021| TV-PG| 91 min|Children & Family...|When the clever b...| | s15|TV Show|Crime Stories: In...| NULL| NULL| NULL|September 22, 2021| 2021| TV-MA| 1 Season|British TV Shows,...|Cameras following...| | s16|TV Show| Dear White People| NULL|Logan Browning, B...| United States|September 22, 2021| 2021| TV-MA|4 Seasons|TV Comedies, TV D...|"Students of colo...| | s17| Movie|Europe's Most Dan...|Pedro de Echave G...| NULL| NULL|September 22, 2021| 2020| TV-MA| 67 min|Documentaries, In...|Declassified docu...| | s18|TV Show| Falsa identidad| NULL|Luis Ernesto Fran...| Mexico|September 22, 2021| 2020| TV-MA|2 Seasons|Crime TV Shows, S...|Strangers Diego a...| | s19| Movie| Intrusion| Adam Salky|Freida Pinto, Log...| NULL|September 22, 2021| 2021| TV-14| 94 min| Thrillers|After a deadly ho...| | s20|TV Show| Jaguar| NULL|Blanca Suárez, Iv...| NULL|September 22, 2021| 2021| TV-MA| 1 Season|International TV ...|In the 1960s, a H...| +-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+ only showing top 20 rows ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']
In [10]:
print("Total de registros:")
print(df.count())
Total de registros: 8809
In [7]:
print("Solo películas:")
df.filter(col("type") == "Movie").show()
Solo películas: +-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+ |show_id| type| title| director| cast| country| date_added|release_year|rating|duration| listed_in| description| +-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+ | s1|Movie|Dick Johnson Is Dead| Kirsten Johnson| NULL| United States|September 25, 2021| 2020| PG-13| 90 min| Documentaries|As her father nea...| | s7|Movie|My Little Pony: A...|Robert Cullen, Jo...|Vanessa Hudgens, ...| NULL|September 24, 2021| 2021| PG| 91 min|Children & Family...|Equestria's divid...| | s8|Movie| Sankofa| Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021| 1993| TV-MA| 125 min|Dramas, Independe...|On a photo shoot ...| | s10|Movie| The Starling| Theodore Melfi|Melissa McCarthy,...| United States|September 24, 2021| 2021| PG-13| 104 min| Comedies, Dramas|A woman adjusting...| | s13|Movie| Je Suis Karl| Christian Schwochow|Luna Wedler, Jann...|Germany, Czech Re...|September 23, 2021| 2021| TV-MA| 127 min|Dramas, Internati...|After most of her...| | s14|Movie|Confessions of an...| Bruno Garotti|Klara Castanho, L...| NULL|September 22, 2021| 2021| TV-PG| 91 min|Children & Family...|When the clever b...| | s17|Movie|Europe's Most Dan...|Pedro de Echave G...| NULL| NULL|September 22, 2021| 2020| TV-MA| 67 min|Documentaries, In...|Declassified docu...| | s19|Movie| Intrusion| Adam Salky|Freida Pinto, Log...| NULL|September 22, 2021| 2021| TV-14| 94 min| Thrillers|After a deadly ho...| | s23|Movie| Avvai Shanmughi| K.S. Ravikumar|Kamal Hassan, Mee...| NULL|September 21, 2021| 1996| TV-PG| 161 min|Comedies, Interna...|Newly divorced an...| | s24|Movie|Go! Go! Cory Cars...|Alex Woo, Stanley...|Maisie Benson, Pa...| NULL|September 21, 2021| 2021| TV-Y| 61 min|Children & Family...|From arcade games...| | s25|Movie| Jeans| S. Shankar|Prashanth, Aishwa...| India|September 21, 2021| 1998| TV-14| 166 min|Comedies, Interna...|When the father o...| | s27|Movie| Minsara Kanavu| Rajiv Menon|Arvind Swamy, Kaj...| NULL|September 21, 2021| 1997| TV-PG| 147 min|Comedies, Interna...|A tangled love tr...| | s28|Movie| Grown Ups| Dennis Dugan|Adam Sandler, Kev...| United States|September 20, 2021| 2010| PG-13| 103 min| Comedies|Mourning the loss...| | s29|Movie| Dark Skies| Scott Stewart|Keri Russell, Jos...| United States|September 19, 2021| 2013| PG-13| 97 min|Horror Movies, Sc...|A family’s idylli...| | s30|Movie| Paranoia| Robert Luketic|Liam Hemsworth, G...|United States, In...|September 19, 2021| 2013| PG-13| 106 min| Thrillers|Blackmailed by hi...| | s31|Movie| Ankahi Kahaniya|Ashwiny Iyer Tiwa...|Abhishek Banerjee...| NULL|September 17, 2021| 2021| TV-14| 111 min|Dramas, Independe...|As big city life ...| | s36|Movie|The Father Who Mo...| Daniel Sandu|Adrian Titieni, E...| NULL|September 17, 2021| 2021| TV-MA| 110 min|Dramas, Internati...|When his son goes...| | s37|Movie| The Stronghold| Cédric Jimenez|Gilles Lellouche,...| NULL|September 17, 2021| 2021| TV-MA| 105 min|Action & Adventur...|Tired of the smal...| | s39|Movie| Birth of the Dragon| George Nolfi|Billy Magnussen, ...|China, Canada, Un...|September 16, 2021| 2017| PG-13| 96 min|Action & Adventur...|A young Bruce Lee...| | s42|Movie| Jaws| Steven Spielberg|Roy Scheider, Rob...| United States|September 16, 2021| 1975| PG| 124 min|Action & Adventur...|When an insatiabl...| +-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+ only showing top 20 rows
In [8]:
print("Títulos por país:")
df.groupBy("country") \
.agg(count("*").alias("total_titulos")) \
.orderBy(desc("total_titulos")) \
.show()
Títulos por país: +--------------------+-------------+ | country|total_titulos| +--------------------+-------------+ | United States| 2805| | India| 972| | NULL| 832| | United Kingdom| 419| | Japan| 245| | South Korea| 199| | Canada| 181| | Spain| 145| | France| 123| | Mexico| 110| | Egypt| 106| | Turkey| 105| | Nigeria| 93| | Australia| 87| | Taiwan| 81| | Indonesia| 79| | Brazil| 77| |United Kingdom, U...| 75| | Philippines| 75| |United States, Ca...| 73| +--------------------+-------------+ only showing top 20 rows
In [9]:
print("Conteo por tipo:")
df.groupBy("type") \
.agg(count("*").alias("total")) \
.show()
Conteo por tipo: +-------------+-----+ | type|total| +-------------+-----+ | NULL| 1| | TV Show| 2676| | Movie| 6131| |William Wyler| 1| +-------------+-----+
In [11]:
from pyspark.sql.functions import col, count, desc
print("Top 10 directores con más títulos:")
df.filter(col("director").isNotNull()) \
.groupBy("director") \
.agg(count("*").alias("total_titulos")) \
.orderBy(desc("total_titulos")) \
.limit(10) \
.show()
Top 10 directores con más títulos: +--------------------+-------------+ | director|total_titulos| +--------------------+-------------+ | Rajiv Chilaka| 19| |Raúl Campos, Jan ...| 18| | Marcus Raboy| 16| | Suhas Kadav| 16| | Jay Karas| 14| | Cathy Garcia-Molina| 13| | Youssef Chahine| 12| | Martin Scorsese| 12| | Jay Chapman| 12| | Steven Spielberg| 11| +--------------------+-------------+
In [ ]: