@@ -31,10 +31,6 @@ print(listdir(wksp_dir))
3131# Add folder to colab's path so we can import the helper functions
3232import sys
3333sys.path.insert(0 , wksp_dir)
34-
35- # Read the data back in.
36- from pandas import read_csv
37- data = read_csv(" /content/drive/My Drive/Colab Notebooks/text-analysis/data/data.csv" )
3834```
3935
4036~~~
@@ -57,193 +53,26 @@ Mounted at /content/drive
5753```
5854
5955### Load in the data
60- Create list of files we'll use for our analysis. We'll start by fitting a word2vec model to just one of the books in our list — Moby Dick.
61-
62- Get list of files available to analyze
63-
64- ``` python
65- from helpers import create_file_list
66- data_dir = wksp_dir + ' /data/books/'
67- corpus_file_list = create_file_list(data_dir, " *.txt" )
68- corpus_file_list[0 :5 ]
69- ```
70-
71- ~~~
72- ['/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dickens-bleakhouse.txt',
73- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dumas-blacktulip.txt',
74- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/austen-northanger.txt',
75- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/dickens-christmascarol.txt',
76- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/austen-persuasion.txt']
77- ~~~
78- {: .output}
79-
80- Parse filelist into a dataframe. Make sure you don't have any extra forward slashes in the pattern — this will cause an error in the helper function.
81-
82- ``` python
83- pattern = data_dir + " {Author} -{Title} .txt"
84- pattern
85- ```
86-
87- ~~~
88- '/content/drive/My Drive/Colab Notebooks/text-analysis/data/books/{Author}-{Title}.txt'
89- ~~~
90- {: .output}
9156
9257``` python
93- from helpers import parse_into_dataframe
94- data = parse_into_dataframe(data_dir + " {Author} - {Title} .txt " , corpus_file_list)
95- data.head( )
58+ # Read the data back in.
59+ from pandas import read_csv
60+ data = read_csv("/content/drive/My Drive/Colab Notebooks/text-analysis/data/data.csv")
9661```
9762
98-
99-
100- <div id =" df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c " >
101- <div class="colab-df-container">
102- <div>
103- <style scoped >
104- .dataframe tbody tr th :only-of-type {
105- vertical-align : middle ;
106- }
107-
108- .dataframe tbody tr th {
109- vertical-align : top ;
110- }
111-
112- .dataframe thead th {
113- text-align : right ;
114- }
115- </style >
116- <table border =" 1 " class =" dataframe " >
117- <thead >
118- <tr style="text-align: right;">
119- <th></th>
120- <th>Author</th>
121- <th>Title</th>
122- <th>File</th>
123- </tr>
124- </thead >
125- <tbody >
126- <tr>
127- <th>0</th>
128- <td>dickens</td>
129- <td>bleakhouse</td>
130- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
131- </tr>
132- <tr>
133- <th>1</th>
134- <td>dumas</td>
135- <td>blacktulip</td>
136- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
137- </tr>
138- <tr>
139- <th>2</th>
140- <td>austen</td>
141- <td>northanger</td>
142- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
143- </tr>
144- <tr>
145- <th>3</th>
146- <td>dickens</td>
147- <td>christmascarol</td>
148- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
149- </tr>
150- <tr>
151- <th>4</th>
152- <td>austen</td>
153- <td>persuasion</td>
154- <td>/content/drive/My Drive/Colab Notebooks/text-a...</td>
155- </tr>
156- </tbody >
157- </table >
158- </div >
159- <button class="colab-df-convert" onclick="convertToInteractive('df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c')"
160- title="Convert this dataframe to an interactive table."
161- style="display:none;">
162-
163- <svg xmlns="http://www.w3.org/2000/svg " height="24px"viewBox="0 0 24 24"
164- width="24px">
165- <path d =" M0 0h24v24H0V0z " fill =" none " />
166- <path d =" M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z " /><path d =" M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z " />
167- </svg >
168- </button>
169-
170- <style >
171- .colab-df-container {
172- display :flex ;
173- flex-wrap :wrap ;
174- gap : 12px ;
175- }
176-
177- .colab-df-convert {
178- background-color : #E8F0FE ;
179- border : none ;
180- border-radius : 50% ;
181- cursor : pointer ;
182- display : none ;
183- fill : #1967D2 ;
184- height : 32px ;
185- padding : 0 0 0 0 ;
186- width : 32px ;
187- }
188-
189- .colab-df-convert :hover {
190- background-color : #E2EBFA ;
191- box-shadow : 0px 1px 2px rgba (60 , 64 , 67 , 0.3 ), 0px 1px 3px 1px rgba (60 , 64 , 67 , 0.15 );
192- fill : #174EA6 ;
193- }
194-
195- [theme = dark ] .colab-df-convert {
196- background-color : #3B4455 ;
197- fill : #D2E3FC ;
198- }
199-
200- [theme = dark ] .colab-df-convert :hover {
201- background-color : #434B5C ;
202- box-shadow : 0px 1px 3px 1px rgba (0 , 0 , 0 , 0.15 );
203- filter : drop-shadow (0px 1px 2px rgba (0 , 0 , 0 , 0.3 ));
204- fill : #FFFFFF ;
205- }
206- </style >
207-
208- <script>
209- const buttonEl =
210- document.querySelector('#df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c button.colab-df-convert');
211- buttonEl.style.display =
212- google.colab.kernel.accessAllowed ? 'block' : 'none';
213-
214- async function convertToInteractive(key) {
215- const element = document.querySelector('#df-5f4a4787-9f3f-41ee-80d1-477fc6170a9c');
216- const dataTable =
217- await google.colab.kernel.invokeFunction('convertToInteractive',
218- [key], {});
219- if (!dataTable) return;
220-
221- const docLinkHtml = 'Like what you see? Visit the ' +
222- '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
223- + ' to learn more about interactive tables.';
224- element.innerHTML = '';
225- dataTable['output_type'] = 'display_data';
226- await google.colab.output.renderOutput(dataTable, element);
227- const docLink = document.createElement('div');
228- docLink.innerHTML = docLinkHtml;
229- element.appendChild(docLink);
230- }
231- </script>
232- </div>
233- </div >
234-
63+ Create a list of the files we'll use for our analysis. We'll start by fitting a word2vec model to just one of the books in our list — Moby Dick.
23564
23665``` python
23766single_file = data.loc[data['Title'] == 'moby_dick', 'File'].item()
23867single_file
239- ```
24068
69+ ```
24170~~~
24271'/content/drive/My Drive/Colab Notebooks/text-analysis/data/melville-moby_dick.txt'
24372~~~
24473{: .output}
24574
246- Let's preview the file contents to make sure our code so far is working correctly.
75+ Let's preview the file contents to make sure our code and directory setup are working correctly.
24776
24877``` python
24978# open and read file
0 commit comments