# Download data
wget https://raw.githubusercontent.com/slanglab/IndiaPoliceEvents/main/data/final/docs.jsonl

# .jsonl files are .json files with a new line for every observation
# In this case, every line is a single news article

# For every command-line tool we use you can look at its manual pages and examples via
man head
man tail
man gshuf
#etc

# Look at the first 10 lines using head
head -n10 docs.jsonl

# Look at just the first line using head
head -n1 docs.jsonl

# Hmmm that's not easy to see what's here
# Let's use the jq tool which you can download here https://stedolan.github.io/jq/
head -n1 docs.jsonl | jq .
# On the command line the vertical bar "|" is a "pipe", meaning we take the output
# of the first part and send it through the second part

# We can also look at the last line(s) with tail
tail -n1 docs.jsonl | jq .

# Let's throw away everything but the full text
# We can do this with a regular expression
head -n1 docs.jsonl | jq . | grep 'doc_text'
# Using grep in the line above says return every line that matches
# the string 'doc_text'

# Now let's get a random line of text
# We use gshuf on Mac and shuf on Linux to shuffle the text
gshuf docs.jsonl | head -n1 | jq . | grep 'doc_text'
#shuf docs.jsonl | head -n1 | jq . | grep 'doc_text' #use this command on Linux

# Let's grep the date to confirm we're actually shuffling properly
gshuf docs.jsonl | head -n1 | jq . | grep 'doc_id'

# Now let's shuffle the dataset, grab five random documents,
# And write only their full text to disk
gshuf docs.jsonl | head -n5 | jq . | grep 'doc_text' > sampledocs.txt
# Above, the > is output redirection

# Let's examine the file to make sure we saved properly
cat sampledocs.txt

# We can use wc to count the number of lines in a file
# Recall, we should have one line per document
wc -l sampledocs.txt

# Let's get rid of this "doc_text" garbage that came from the .jsonl
# We'll use sed
sed 's/"doc_text":/ /' sampledocs.txt

# Let's write this to disk as well
sed 's/"doc_text":/ /' sampledocs.txt > sampledocs-clean.txt

# For fun let's translate this file to all uppercase using tr
tr "[:lower:]" "[:upper:]" < sampledocs-clean.txt
# The < above describes the input file

# Notice the command above did not actually change the file itself
cat sampledocs-clean.txt

# Now let's create a list of the words in file1, one per line,
# where a word is taken to be a maximal string of letters.
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt
# Above -c flag stands for complement (of our first string which in this)
# case is "[:alpha:]"
# The -s flag stands for squeeze so all '\n' get collapsed into one instance

# We can sort the output
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort

# We can also sort in reverse order
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort -r

# Now let's count the unique words
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort | uniq -c

# And sort again to look at the top
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort | uniq -c | sort -nr > wordcounts.txt

# We can use less to start at the top and scroll through a file
less wordcounts.txt

# Let's put it all together to get the top word counts on the entire corpus
cat docs.jsonl | jq . | grep 'doc_text' | sed 's/"doc_text":/ /' > alldocs_clean.txt
cat alldocs_clean.txt | tr -cs "[:alpha:]" '\n'| sort | uniq -c | sort -nr > alldocs_wordcounts.txt

# Examine the top word counts
# Next lecture we'll talk about "stop words" (e.g. the, of, to)
# Compared to "content words" (e.g. government)
less alldocs_wordcounts.txt

# We can ignore words that we don't want
grep -v "the" alldocs_wordcounts.txt | head -n10
# Above the -v command only outputs lines where the pattern does NOT match

# Adding an -E to grep allows us to use regular expressions
grep -v -E "the|in" alldocs_wordcounts.txt | head -n10

#### REGULAR EXPRESSIONS ####
less alldocs_clean.txt

# regular expressions can help us easily search in a new corpus we're exploring
grep -E --color "new delhi" alldocs_clean.txt
grep -E --color "gujarat" alldocs_clean.txt
grep -E --color "riots" alldocs_clean.txt

# Let's use or
grep -E --color "new delhi|gujarat" alldocs_clean.txt