* do e:\DHS\programs\recoding\CWI_recoding_do_3June2014.txt

* The executable lines begin after the multiple rows of asterisks;
*   everything before that is a subroutine.

* Modification to the logic on June 2: pair the final column with each of the
*   preceding columns and then do a reshape and merge, rather than doing
*   a reshape long on the original file


***************************************************************

program define reshape_excel_file

* This routine converts an Excel file to a Stata dta file that
*   lists out all the recodes for this country.


* The routine could be shortened substantially but it's not worth the trouble.

* First check the Excel file for this country to see how many surveys
*   are included; assume in the range from 1 to 6.

scalar scountry=trim(A[1])

capture confirm string variable D, exact 
if _rc==0 {
  scalar nsurveys=1
}

capture confirm string variable F, exact 
if _rc==0 {
  scalar nsurveys=2
}

capture confirm string variable H, exact 
if _rc==0 {
  scalar nsurveys=3
}

capture confirm string variable J, exact 
if _rc==0 {
  scalar nsurveys=4
}

capture confirm string variable L, exact 
if _rc==0 {
  scalar nsurveys=5
}

capture confirm string variable N, exact 
if _rc==0 {
  scalar nsurveys=6
}


scalar list nsurveys

drop A

* Now extract some scalars and rename some columns

if nsurveys>=1 {
scalar year_1    =trim(B[1])
scalar filename_1=trim(B[2])
rename B codes_1
drop C
}

if nsurveys>=2 {
scalar year_2    =trim(D[1])
scalar filename_2=trim(D[2])
rename D codes_2
drop E
}

if nsurveys>=3 {
scalar year_3    =trim(F[1])
scalar filename_3=trim(F[2])
rename F codes_3
drop G
}

if nsurveys>=4 {
scalar year_4    =trim(H[1])
scalar filename_4=trim(H[2])
rename H codes_4
drop I
}

if nsurveys>=5 {
scalar year_5    =trim(J[1])
scalar filename_5=trim(J[2])
rename J codes_5
drop K
}

if nsurveys>=6 {
scalar year_6    =trim(L[1])
scalar filename_6=trim(L[2])
rename L codes_6
drop M
}

if nsurveys==1 {
rename D recodes
}

if nsurveys==2 {
rename F recodes
}

if nsurveys==3 {
rename H recodes
}

if nsurveys==4 {
rename J recodes
}

if nsurveys==5 {
rename L recodes
}

if nsurveys==6 {
rename N recodes
}

/*
The remainder of this subroutine is not well documented.  Almost everything could be
  done differently and even better, but it is not worth the time to re-write it.

Here is the logic:

First break the input file into two parts.

The first part, with "r" in the label, consists just of the lines with variable names.
It is converted from wide to long, not with the reshape command, but with append. It has
  two columns, consisting of the variable names for the first survey, plus the recoded
  variables names; then the variable names for the second survey, plus the recoded variable
  names, etc.

The second part consists of just the lines with the code values and recode values.  It is
  converted from wide to long, with just two columns, in the same way.

Then the first and second parts are merged, giving four columns that consist of
 var, code, newvar, recode, all on one line (plus a variable to identify the survey). 

To break the input file into  the two  parts, we add two codes: 
  code1=1 will identify lines that consist of variable names;
  code2 numbers each block of lines that begins with a variable name

*/


gen str2 testvar=substr(recodes,1,2)

gen code1=0
replace code1=1 if testvar=="hv" | testvar=="sh"

gen code2=.
scalar scount=0

gen lines=_n
summarize lines
scalar nlines=r(max)

scalar sline=1
while sline<=nlines {

if code1[sline]==1 {
scalar scount=scount+1
}

replace code2=scount if line==sline

scalar sline=sline+1
}

* This file is the original plus code1 and code2
save temp.dta, replace


* make a long file of variable names and labels;
* first make a separate file for each survey

keep if code1==1
drop code1
sort code2
rename recodes newvar
save temp_r_wide.dta, replace

scalar ssurvey=1
while ssurvey<=nsurveys {
local lsurvey=ssurvey
use temp_r_wide.dta, clear
keep codes_`lsurvey' newvar code2
rename codes_`lsurvey' var
gen survey=ssurvey
save temp_r_`lsurvey'.dta, replace
scalar ssurvey=ssurvey+1
}

* append these surveys

use temp_r_1, clear
scalar ssurvey=2
while ssurvey<=nsurveys {
local lsurvey=ssurvey
append using temp_r_`lsurvey'.dta
scalar ssurvey=ssurvey+1
}

sort survey code2

* this is the long file of the first type, with variable names
save temp_r_long.dta, replace


* Now make a file of the values of the variables
use temp.dta, clear
drop if code1==1
save temp_wide.dta, replace

* form separate files that consist only of codes_1 and recodes; codes_2 and recodes; etc.;
*  then merge temp_r.dta with this

scalar ssurvey=1
while ssurvey<=nsurveys {
local lsurvey=ssurvey

use temp_wide.dta, clear

keep codes_`lsurvey' recodes code2
rename codes_`lsurvey' codes
gen survey=ssurvey
save temp_`lsurvey'.dta, replace

scalar ssurvey=ssurvey+1
}

* append these surveys
use temp_1, clear
scalar ssurvey=2
while ssurvey<=nsurveys {
local lsurvey=ssurvey
append using temp_`lsurvey'.dta
scalar ssurvey=ssurvey+1
}

* the current file is the long file of the second type, with variable values
* clean up the codes, convert to numeric

replace codes=substr(codes,1,2)
gen str1 col1=substr(codes,1,1)
gen str1 col2=substr(codes,2,1)
replace codes=col1 if col2=="."
drop if codes<="."
drop if codes>"99"
drop col1 col2
replace recodes=substr(recodes,1,2)
gen str1 col1=substr(recodes,1,1)
gen str1 col2=substr(recodes,2,1)
replace recodes=col1 if col2=="."
drop if recodes<="."
drop if recodes>"99"
drop col1 col2


destring codes, replace
destring recodes, replace
sort survey code2
save temp_long.dta, replace
merge survey code2 using temp_r_long.dta

drop _merge code2
replace var=substr(var,1,6)
replace var=regexr(var,"-"," ")
replace var=trim(var)
replace newvar=substr(newvar,1,6)
replace newvar=regexr(newvar,"-"," ")
replace newvar=trim(newvar)
replace newvar=newvar+"_r"

sort newvar recodes var codes survey

list newvar recodes var codes survey, table clean

save recodes.dta, replace

gen str2 cid="."
gen str2 pv="."
gen str9 year="."

scalar ssurvey=1
while ssurvey<=nsurveys {
local lsurvey=ssurvey
replace cid=substr(filename_`lsurvey',1,2) if survey==ssurvey
replace pv=substr(filename_`lsurvey',5,2) if survey==ssurvey
replace year=year_`lsurvey' if survey==ssurvey
scalar ssurvey=ssurvey+1
}

drop if codes==. | recodes==.

scalar scid=cid[1]
local lcid=scid
save `lcid'recodes.dta, replace

* This is the file that has the old variable name and value, and the new
*   variable name and value, with a separate line for each recode instruction
list, table clean
save recodes.dta, replace

end

***************************************************************
program define labels_and_01codes

* This routine can be expanded to include any labels and binary variables
*   that are needed.

label define hv201_r 11 "piped into dwelling" 12 "piped to yard/plot" 21 "well" 43 "river etc." 96 "other"

gen waterpipedin =hv201_r==11
gen waterpipedout=hv201_r==12
gen waterwellin  =hv201_r==41
gen waterriver   =hv201_r==43
gen waterother   =hv201_r==96
bysort hpv: tab hv201_r hv201

tab1 water*

* Insert similar lines for other constructed variables


* Example of output that can be used to confirm the recodes
sort hpv
by hpv: tab hv201 hv201_r,m
by hpv: tab hv205 hv205_r,m
by hpv: tab hv213 hv213_r,m

end

***************************************************************

program define make_recodes

* This routine lists the new output variables explicitly (required!)
*   and runs through the recode instructions in recodes.dta, line by line

cd c:\DHS\DHS_Data\HR_files  
 
local lcid=scid

use "`lcid'HR.dta", clear 
gen hv000r=substr(hv000,1,2)
gen str9 year="."


* must list here all the new output variables
gen hv201_r=.
gen hv205_r=.
gen hv206_r=.
gen hv207_r=.
gen hv208_r=.
gen hv209_r=.
gen hv210_r=.
gen hv211_r=.
gen hv212_r=.
gen hv213_r=.
gen hv243c_r=.
gen sh111o_r=.


scalar sline=1
while sline<=nlines{
local lline=sline
scalar     spv=spv_`lline'
local      lpv=spv

scalar svar=svar_`lline'
local  lvar=svar

scalar snewvar=snewvar_`lline'
local  lnewvar=snewvar

scalar   scode=scode_`lline'
local    lcode=scode

scalar srecode=srecode_`lline'
local  lrecode=srecode

scalar   syear=syear_`lline'
replace year=syear if hpv=="`lpv'"

scalar list sline spv svar snewvar scode srecode

replace `lnewvar'=`lrecode' if hpv=="`lpv'" & `lvar'==`lcode'

scalar sline=sline+1
}

labels_and_01codes

gen str20 country=scountry

save "`lcid'HR_recoded.dta", replace 

end

***************************************************************

program define combine_files

* This routine combines all of the HR data files for a specific country;
*   it is necessary to keep any and all of the variables that are being recoded

cd c:\DHS\DHS_Data\HR_files  
local lcid=cid
 
scalar ssurvey=1
while ssurvey<=nsurveys {
local lsurvey=ssurvey
scalar sfn=filename_`lsurvey'+".dta"
scalar list sfn
local lfn=sfn

use `lfn', clear

gen shpv=substr(sfn,5,2)

gen str2 hpv=shpv

* save space by dropping any subscripted variables, but keep all others
drop *_*

if ssurvey>1 {
append using "`lcid'HR.dta"
}

save "`lcid'HR.dta", replace
scalar ssurvey=ssurvey+1
}


* The following line may have to be country specific because of the way the variables are named
keep hpv hv000 hv001 hv002 hv201 hv205 hv206 hv207 hv208 hv210 hv213 hv214 hv215 hv225 sh*

save "`lcid'HR.dta", replace

end

***************************************************************
program define scalarize_recode_instructions

* This routine scalarizes the recode instructions in recodes.dta

cd e:\DHS\programs\recoding
use recodes.dta, clear

gen line=_n
summarize line
scalar nlines=r(max)

local lcid=scid

scalar sline=1
while sline<=nlines {
local lline=sline
scalar     spv_`lline'=pv[`lline']
scalar    svar_`lline'=var[`lline']
scalar snewvar_`lline'=newvar[`lline']
scalar   scode_`lline'=code[`lline']
scalar srecode_`lline'=recode[`lline']
scalar   syear_`lline'=year[`lline']

scalar sline=sline+1
}


end

***************************************************************

program define main

* This routine runs the routines separately for each sheet of the excel file

reshape_excel_file
combine_files
scalarize_recode_instructions
make_recodes
clear all

cd e:\DHS\programs\recoding
shell del *temp*.dta

end

***************************************************************
***************************************************************
***************************************************************
***************************************************************
***************************************************************
***************************************************************
* EXECUTION BEGINS HERE

/* 

The input file must include 

cid:    two-character string for country id
pv:     two-character string for columns 5-6 of the HR filename
var:    variable that is being recoded, a string such as hv201
code:   numerical origin code
recode: numerical destination code

Check the paths!!  

The workfiles and programs are in one location ("e:\DHS\programs\recoding")
The input files are assumed to be already dowloaded; they and the combined recoded
  output file are in another location ("c:\DHS\DHS_Data\HR_files")


*/


* cid is the two-digit country id code
* pv is the phase and version part of the filename

cd e:\DHS\programs\recoding
set logtype text
log using CWI_recoding_log_3June2014.txt, replace
set more off

* Set up a loop to run through all sheets if the excel file; need an
*  "import excel" line for each sheet, followed by "main", repeatedly

import excel Ghana2.xls
main