import pandas as pd


# Load the Netflix titles dataset straight from the raw GitHub CSV.
netflix_titles = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/minaahayley/Python/main/data/netflix_titles.csv'
)


# Peek at one randomly chosen row.
netflix_titles.sample(n=1)


# One random row whose rating contains 'TV'; na=False counts a missing rating as "no match".
netflix_titles.loc[netflix_titles['rating'].str.contains('TV', na=False)].sample(n=1)


# One random row from the complement: rating is missing or does not contain 'TV'.
netflix_titles.loc[~netflix_titles['rating'].str.contains('TV', na=False)].sample(n=1)


# Break every rating into a list of pieces around '-'.
netflix_titles['rating'].str.split(pat='-')

0       [PG, 13]
1       [TV, MA]
2       [TV, MA]
3       [TV, MA]
4       [TV, MA]
          ...   
8802         [R]
8803    [TV, Y7]
8804         [R]
8805        [PG]
8806    [TV, 14]
Name: rating, Length: 8807, dtype: object


# expand=True spreads the pieces across columns instead of keeping one list per row.
netflix_titles['rating'].str.split(pat='-', expand=True)


# Keep only the first piece: column 0 of the expanded frame.
netflix_titles['rating'].str.split(pat='-', expand=True).loc[:, 0]

0       PG
1       TV
2       TV
3       TV
4       TV
        ..
8802     R
8803    TV
8804     R
8805    PG
8806    TV
Name: 0, Length: 8807, dtype: object


# Find the titles that contain 'Zombie'.
# Series.str.find gives the match position per title (-1 when absent, NaN when
# the title itself is missing), so `>= 0` keeps real matches only and a missing
# title cannot raise the AttributeError a plain Python `x.find(...)` would.
zombie = netflix_titles.loc[netflix_titles['title'].str.find('Zombie') >= 0, 'title'].tolist()
zombie

['IZombie',
 'Rise of the Zombie',
 'Scooby-Doo on Zombie Island',
 'Zombie Dumb',
 'Zombieland']


netflix_titles.sample()


# Parse `date_added` into datetimes and keep only the year component.
# The captured output is float64 (e.g. 2021.0) rather than int, and later cells
# surface NaN values derived from this column, so presumably some dates are missing.
netflix_titles['added_year'] = pd.to_datetime(netflix_titles['date_added']).dt.year
netflix_titles['added_year']

0       2021.0
1       2021.0
2       2021.0
3       2021.0
4       2021.0
         ...  
8802    2019.0
8803    2019.0
8804    2019.0
8805    2020.0
8806    2019.0
Name: added_year, Length: 8807, dtype: float64


# Year gap between release and arrival on the service (release year minus added year);
# a negative value means the title was released before the year it was added.
netflix_titles['release_added'] = netflix_titles['release_year'].sub(netflix_titles['added_year'])
netflix_titles['release_added']

0       -1.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
8802   -12.0
8803    -1.0
8804   -10.0
8805   -14.0
8806    -4.0
Name: release_added, Length: 8807, dtype: float64


# Cast the numeric gap to strings; a missing value becomes the literal text "nan".
netflix_titles['release_added'] = netflix_titles['release_added'].astype(str)


# Converting straight back fails in the captured run: pd.to_numeric raises
# ValueError: Unable to parse string "nan" (see the traceback that follows).
pd.to_numeric(netflix_titles['release_added'])

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "nan"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-15-f88196129310> in <module>
----> 1 pd.to_numeric(netflix_titles['release_added'])

/usr/local/lib/python3.7/dist-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    182         try:
    183             values, _ = lib.maybe_convert_numeric(
--> 184                 values, set(), coerce_numeric=coerce_numeric
    185             )
    186         except (ValueError, TypeError):

/usr/local/lib/python3.7/dist-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "nan" at position 6066


# Convert the string column back to numbers. errors='coerce' turns every value
# that cannot be parsed (here the literal "nan" strings) into NaN, which avoids
# the ValueError without depending on how replace() treats a None replacement.
netflix_titles['release_added'] = pd.to_numeric(netflix_titles['release_added'], errors='coerce')
netflix_titles['release_added']

0       -1.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
8802   -12.0
8803    -1.0
8804   -10.0
8805   -14.0
8806    -4.0
Name: release_added, Length: 8807, dtype: float64


# Distinct year gaps, in order of first appearance (NaN included).
pd.unique(netflix_titles['release_added'])

array([ -1.,   0., -28.,  -3., -25., -23., -24., -11.,  -8.,  -4., -46.,
       -43., -38., -34.,  -9., -20.,  -7., -19., -18., -17., -10., -13.,
       -12., -14., -16., -15., -27.,  -6.,  -2.,  -5., -39., -32., -31.,
       -30., -22., -35., -29., -37., -41., -60., -21., -26., -36., -45.,
       -62., -33., -40., -49., -57., -76.,   1., -66., -64., -50., -47.,
       -44., -93., -51., -55., -48., -42.,   2.,  nan, -54., -59., -61.,
       -52., -63.,   3., -72., -71., -75., -65., -73., -70., -74.])


# Frequency of each distinct `country` string. A combined value such as
# 'Uruguay, Guatemala' is counted as its own category, not per country.
netflix_titles.loc[:, 'country'].value_counts()

United States                             2818
India                                      972
United Kingdom                             419
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: country, Length: 748, dtype: int64


def _us_or_other(country):
    """Return 'US' for exactly 'United States', otherwise 'Non_US'."""
    if country == 'United States':
        return 'US'
    return 'Non_US'


# NOTE(review): if the CSV already ships a `type` column, this assignment replaces it — confirm.
# Only an exact match counts; missing or multi-country values fall into 'Non_US'.
netflix_titles['type'] = netflix_titles['country'].apply(_us_or_other)
netflix_titles[['type','country']]


def _us_india_or_other(country):
    """Return 'US', 'India', or 'Non-US/India' based on an exact country match."""
    if country == 'United States':
        return 'US'
    if country == 'India':
        return 'India'
    return 'Non-US/India'


# Same labelling with a third bucket for India; this overwrites the column set just above.
netflix_titles['type'] = netflix_titles['country'].apply(_us_india_or_other)
netflix_titles[['type','country']]


# Row-wise apply: wrap each row's `country` value in its own Series, which yields
# a frame whose column 0 holds that row's country string.
# NOTE(review): the value is not split on ', ', so a multi-country string stays a single entry.
netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1)


# stack() moves the column labels into an inner index level. The captured output
# has 7976 entries versus 8807 rows, i.e. rows with a missing country are gone.
netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1).stack()

0     0                                        United States
1     0                                         South Africa
4     0                                                India
7     0    United States, Ghana, Burkina Faso, United Kin...
8     0                                       United Kingdom
                                 ...                        
8801  0                         United Arab Emirates, Jordan
8802  0                                        United States
8804  0                                        United States
8805  0                                        United States
8806  0                                                India
Length: 7976, dtype: object


# Drop the inner index level added by stack() so each entry is keyed by its original row label only.
netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1).stack().reset_index(level=1, drop=True)

0                                           United States
1                                            South Africa
4                                                   India
7       United States, Ghana, Burkina Faso, United Kin...
8                                          United Kingdom
                              ...                        
8801                         United Arab Emirates, Jordan
8802                                        United States
8804                                        United States
8805                                        United States
8806                                                India
Length: 7976, dtype: object


# Country strings for the rows that have one, keyed by the original row label.
# dropna() is all the filtering required: a missing country is NaN, never the
# text 'nan', and taking the column directly avoids building one Series per row.
# NOTE(review): multi-country strings such as 'United Kingdom, United States' are
# still counted as one category; split on ', ' and explode if per-country counts are wanted.
s = netflix_titles['country'].dropna()
s.value_counts().head(20)

United States                    2818
India                             972
United Kingdom                    419
Japan                             245
South Korea                       199
Canada                            181
Spain                             145
France                            124
Mexico                            110
Egypt                             106
Turkey                            105
Nigeria                            95
Australia                          87
Taiwan                             81
Indonesia                          79
Brazil                             77
Philippines                        75
United Kingdom, United States      75
United States, Canada              73
Germany                            67
dtype: int64


# Reload the CSV, discarding the columns added to the previous frame.
netflix_titles = pd.read_csv('https://raw.githubusercontent.com/minaahayley/Python/main/data/netflix_titles.csv')


import numpy as np

# Vectorised US / Non_US labelling. The country column spells the value out as
# 'United States' (see the value_counts output earlier); comparing against the
# abbreviation 'US' would leave every row labelled 'Non_US'.
# NOTE(review): if the CSV already ships a `type` column, this assignment replaces it — confirm.
netflix_titles['type'] = np.where(netflix_titles['country'] == 'United States', 'US', 'Non_US')
netflix_titles[['type','country']]

Matplotlib Subplot (0)	2022.11.18
데이터의 빈도수 시각화 (0)	2022.11.18
Matplotlib, Seaborn 시각화 (0)	2022.11.18
시계열 데이터 (0)	2022.11.18
데이터 집계와 재구조화 (0)	2022.11.18

hayley

데이터 전처리

str.contains()

str.split()

str.find()

pd.to_numeric()

apply()

np.where()

'visualization' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	0	1	2
0	PG	13	None
1	TV	MA	None
2	TV	MA	None
3	TV	MA	None
4	TV	MA	None
...	...	...	...
8802	R	None	None
8803	TV	Y7	None
8804	R	None	None
8805	PG	None	None
8806	TV	14	None

	type	country
0	US	United States
1	Non_US	South Africa
2	Non_US	NaN
3	Non_US	NaN
4	Non_US	India
...	...	...
8802	US	United States
8803	Non_US	NaN
8804	US	United States
8805	US	United States
8806	Non_US	India

	type	country
0	US	United States
1	Non-US/India	South Africa
2	Non-US/India	NaN
3	Non-US/India	NaN
4	India	India
...	...	...
8802	US	United States
8803	Non-US/India	NaN
8804	US	United States
8805	US	United States
8806	India	India